From 495ed8c2a7d9383850b4be08a2a9de8695b3009d Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Mon, 4 May 2026 18:59:41 +0000 Subject: [PATCH 1/8] fix(mcp): relax column name regex, improve generate_chart validation errors and examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove overly strict regex pattern from ColumnRef.name, FilterConfig.column, and BigNumberChartConfig.temporal_column — sanitize_name/sanitize_column already handle XSS/SQL injection; the pattern rejected valid column names like "1Q_revenue" (digit-prefixed) or "order-date" (hyphenated) - Extend generate_chart docstring with usage examples for all supported chart types: pie, big_number (with/without trendline), pivot_table, mixed_timeseries, handlebars - Improve _enhance_validation_error fallback in SchemaValidator to produce type-specific, actionable messages instead of raw pydantic error strings (extract _format_single_error helper to reduce cyclomatic complexity) - Add tests verifying digit-prefixed/hyphenated column names now pass, and that XSS/SQL injection is still blocked by sanitize_name() --- superset/mcp_service/chart/schemas.py | 15 +++- .../mcp_service/chart/tool/generate_chart.py | 80 +++++++++++++++++++ .../chart/validation/schema_validator.py | 61 +++++++++++--- 3 files changed, 142 insertions(+), 14 deletions(-) diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index 1fdb4f43ab6d..32dc0c586bb6 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -669,7 +669,10 @@ class ColumnRef(BaseModel): ..., min_length=1, max_length=255, - pattern=r"^[a-zA-Z0-9_][a-zA-Z0-9_\s\-\.]*$", + # No regex pattern: sanitize_name() already blocks XSS/SQL injection; + # many valid column names (digit-prefixed, locale chars, etc.) would + # be rejected by a strict pattern while posing no security risk. + # Use get_dataset_info to find exact column names. validation_alias=AliasChoices("name", "column_name"), ) label: str | None = Field(None, max_length=500) @@ -743,7 +746,10 @@ class FilterConfig(BaseModel): ..., min_length=1, max_length=255, - pattern=r"^[a-zA-Z0-9_][a-zA-Z0-9_\s\-\.]*$", + # No regex pattern: sanitize_name() already blocks XSS/SQL injection; + # many valid column names (digit-prefixed, locale chars, etc.) would + # be rejected by a strict pattern while posing no security risk. + # Use get_dataset_info to find exact column names. validation_alias=AliasChoices("column", "col"), ) op: Literal[ @@ -1082,7 +1088,10 @@ class BigNumberChartConfig(UnknownFieldCheckMixin): ), min_length=1, max_length=255, - pattern=r"^[a-zA-Z0-9_][a-zA-Z0-9_\s\-\.]*$", + # No regex pattern: sanitize_name() already blocks XSS/SQL injection; + # many valid column names (digit-prefixed, locale chars, etc.) would + # be rejected by a strict pattern while posing no security risk. + # Use get_dataset_info to find exact column names. ) time_grain: TimeGrain | None = Field( None, diff --git a/superset/mcp_service/chart/tool/generate_chart.py b/superset/mcp_service/chart/tool/generate_chart.py index 646ac4d4c2a7..366685d22d3c 100644 --- a/superset/mcp_service/chart/tool/generate_chart.py +++ b/superset/mcp_service/chart/tool/generate_chart.py @@ -200,6 +200,86 @@ async def generate_chart( # noqa: C901 } ``` + + Example usage for Pie chart: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "pie", + "dimension": {"name": "product_category"}, + "metric": {"name": "revenue", "aggregate": "SUM"}, + "donut": false + } + } + ``` + + Example usage for Big Number (no trendline): + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "big_number", + "metric": {"name": "total_sales", "aggregate": "SUM"} + } + } + ``` + + Example usage for Big Number with trendline: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "big_number", + "metric": {"name": "revenue", "aggregate": "SUM"}, + "temporal_column": "order_date", + "time_grain": "P1M", + "show_trendline": true + } + } + ``` + + Example usage for Pivot Table: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "pivot_table", + "rows": [{"name": "region"}], + "columns": [{"name": "product_category"}], + "metrics": [{"name": "revenue", "aggregate": "SUM"}] + } + } + ``` + + Example usage for Mixed Timeseries: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "mixed_timeseries", + "x": {"name": "order_date"}, + "y": [{"name": "revenue", "aggregate": "SUM"}], + "primary_kind": "line", + "y_secondary": [{"name": "order_count", "aggregate": "COUNT"}], + "secondary_kind": "bar" + } + } + ``` + + Example usage for Handlebars: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "handlebars", + "handlebars_template": "{{#each data}}{{this.name}}{{/each}}", + "groupby": [{"name": "product"}], + "metrics": [{"name": "revenue", "aggregate": "SUM"}] + } + } + ``` + Example usage for Table chart: ```json { diff --git a/superset/mcp_service/chart/validation/schema_validator.py b/superset/mcp_service/chart/validation/schema_validator.py index 7cae450ff599..dc0eec9958d6 100644 --- a/superset/mcp_service/chart/validation/schema_validator.py +++ b/superset/mcp_service/chart/validation/schema_validator.py @@ -525,6 +525,38 @@ def _pre_validate_mixed_timeseries_config( return True, None + @staticmethod + def _format_single_error(err: Dict[str, Any]) -> tuple[str, str]: + """Return (detail_message, optional_suggestion) for one pydantic error.""" + loc_parts = [str(p) for p in err.get("loc", [])] + loc = " -> ".join(loc_parts) + msg = err.get("msg", "Validation failed") + err_type = err.get("type", "") + ctx = err.get("ctx", {}) or {} + field = loc_parts[-1] if loc_parts else "field" + + if err_type == "string_pattern_mismatch": + return ( + f"'{field}' value contains disallowed characters. " + "Column names must not contain HTML, script tags, or SQL " + "injection patterns. Use the exact column name from your dataset.", + "Use get_dataset_info to find exact column names", + ) + if err_type == "literal_error": + expected = ctx.get("expected", "") + return f"'{field}' has an invalid value. Expected one of: {expected}", "" + if err_type == "missing": + return ( + f"Required field '{field}' is missing", + "Check the chart_type examples in the tool description", + ) + if err_type == "value_error": + return ( + f"{loc}: {msg}", + "Use get_dataset_info to verify column names and types", + ) + return f"{loc}: {msg}", "" + @staticmethod def _enhance_validation_error( error: PydanticValidationError, request_data: Dict[str, Any] @@ -609,22 +641,29 @@ def _enhance_validation_error( error_code="BIG_NUMBER_VALIDATION_ERROR", ) - # Default enhanced error + # Default enhanced error: build actionable per-field messages error_details = [] - for err in errors[:3]: # Show first 3 errors - loc = " -> ".join(str(location) for location in err.get("loc", [])) - msg = err.get("msg", "Validation failed") - error_details.append(f"{loc}: {msg}") + extra_suggestions: list[str] = [] + for err in errors[:5]: # Surface up to 5 errors + detail, suggestion = SchemaValidator._format_single_error(err) + error_details.append(detail) + if suggestion: + extra_suggestions.append(suggestion) return ChartGenerationError( error_type="validation_error", message="Chart configuration validation failed", details="; ".join(error_details), - suggestions=[ - "Check that all required fields are present", - "Ensure field types match the schema", - "Use get_dataset_info to verify column names", - "Refer to the API documentation for field requirements", - ], + suggestions=list( + dict.fromkeys( + [ + "Check that all required fields are present", + "Ensure field types match the schema", + "Use get_dataset_info to verify column names", + "Refer to the API documentation for field requirements", + ] + + extra_suggestions + ) + ), error_code="VALIDATION_ERROR", ) From 52189106ce619fddec3090593691da04195cc317 Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Mon, 4 May 2026 19:00:11 +0000 Subject: [PATCH 2/8] test(mcp): add tests for relaxed column name validation in generate_chart --- .../mcp_service/chart/test_chart_schemas.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 1da9d1bc3a74..5ae7a38fc4ac 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -778,3 +778,85 @@ def test_client_warnings_discarded_even_when_server_also_warns(self) -> None: assert len(req.sanitization_warnings) == 1 assert "chart_name" in req.sanitization_warnings[0] assert "injected" not in req.sanitization_warnings[0] + + +class TestColumnRefNameRelaxedPattern: + """ColumnRef.name no longer enforces a strict regex pattern. + + Many valid database column names were previously rejected: + - Names starting with a digit (e.g. "1Q_revenue") + - Names with locale-specific characters + The field_validator sanitize_name() still blocks XSS and SQL injection. + """ + + def test_digit_prefixed_name_accepted(self) -> None: + """Column names starting with a digit must now be accepted.""" + col = ColumnRef(name="1Q_revenue") + assert col.name == "1Q_revenue" + + def test_name_with_hyphen_accepted(self) -> None: + col = ColumnRef(name="order-date") + assert col.name == "order-date" + + def test_name_with_dot_accepted(self) -> None: + col = ColumnRef(name="schema.column") + assert col.name == "schema.column" + + def test_name_with_spaces_accepted(self) -> None: + col = ColumnRef(name="Total Revenue") + assert col.name == "Total Revenue" + + def test_xss_attempt_blocked(self) -> None: + """sanitize_name() still blocks XSS even without the regex.""" + with pytest.raises(ValidationError): + ColumnRef(name="") + + def test_sql_keyword_blocked(self) -> None: + """check_sql_keywords=True still blocks pure SQL statements.""" + with pytest.raises(ValidationError): + ColumnRef(name="1; DROP TABLE users; --") + + def test_empty_name_blocked(self) -> None: + with pytest.raises(ValidationError): + ColumnRef(name="") + + def test_table_chart_with_digit_prefixed_column(self) -> None: + """End-to-end: digit-prefixed column passes through GenerateChartRequest.""" + req = GenerateChartRequest( + dataset_id=1, + config={ + "chart_type": "table", + "columns": [ + {"name": "1Q_revenue"}, + {"name": "product_name"}, + ], + }, + ) + assert req.config.chart_type == "table" + + def test_xy_chart_with_hyphenated_column(self) -> None: + req = GenerateChartRequest( + dataset_id=1, + config={ + "chart_type": "xy", + "x": {"name": "order-date"}, + "y": [{"name": "1Q-revenue", "aggregate": "SUM"}], + }, + ) + assert req.config.chart_type == "xy" + + +class TestFilterConfigColumnRelaxedPattern: + """FilterConfig.column no longer enforces a strict regex pattern.""" + + def test_digit_prefixed_filter_column_accepted(self) -> None: + from superset.mcp_service.chart.schemas import FilterConfig + + f = FilterConfig(column="1Q_flag", op="=", value="active") + assert f.column == "1Q_flag" + + def test_hyphenated_filter_column_accepted(self) -> None: + from superset.mcp_service.chart.schemas import FilterConfig + + f = FilterConfig(column="order-status", op="=", value="shipped") + assert f.column == "order-status" From 38972f0150f86f99cea17f85d1371722f6fd50ac Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Tue, 5 May 2026 06:21:49 +0000 Subject: [PATCH 3/8] fix(mcp): address review feedback on generate_chart schema rigidity PR - FilterConfig.column: add check_sql_keywords=True to sanitize_column (Copilot review: sanitize_column was missing SQL keyword checking) - BigNumberChartConfig.temporal_column: add sanitize_temporal_column field_validator using sanitize_user_input with check_sql_keywords=True (Copilot review: no validator after regex removal left field unprotected) - generate_chart docstring IMPORTANT: list all chart types, not just xy/table (Copilot review: IMPORTANT section was misleading after adding more examples) - Fix test_xss_attempt_blocked: nh3 strips HTML tags instead of rejecting, so rename to test_xss_tags_are_stripped (asserts tag is removed) and add test_event_handler_injection_blocked (on...= patterns ARE rejected) - Fix _format_single_error literal_error: preserve pydantic 'Input should be' message instead of replacing with custom format (broke existing test test_non_value_error_pydantic_body_is_surfaced) - Add test_sql_injection_in_filter_column_blocked to verify FilterConfig now rejects SQL injection column names --- superset/mcp_service/chart/schemas.py | 20 ++++++++++++++----- .../mcp_service/chart/tool/generate_chart.py | 3 ++- .../chart/validation/schema_validator.py | 6 +++--- .../mcp_service/chart/test_chart_schemas.py | 20 ++++++++++++++++--- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index 32dc0c586bb6..9632bec2b23f 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -781,7 +781,9 @@ def sanitize_column(cls, v: str) -> str: """Sanitize filter column name to prevent injection attacks.""" # sanitize_user_input raises ValueError when allow_empty=False (default) # so the return value is guaranteed to be a non-None str - return sanitize_user_input(v, "Filter column", max_length=255) # type: ignore[return-value] + return sanitize_user_input( # type: ignore[return-value] + v, "Filter column", max_length=255, check_sql_keywords=True + ) @field_validator("value") @classmethod @@ -1088,11 +1090,19 @@ class BigNumberChartConfig(UnknownFieldCheckMixin): ), min_length=1, max_length=255, - # No regex pattern: sanitize_name() already blocks XSS/SQL injection; - # many valid column names (digit-prefixed, locale chars, etc.) would - # be rejected by a strict pattern while posing no security risk. - # Use get_dataset_info to find exact column names. + # No regex pattern — see field description above. ) + + @field_validator("temporal_column") + @classmethod + def sanitize_temporal_column(cls, v: str | None) -> str | None: + """Sanitize temporal column name to prevent XSS and SQL injection.""" + if v is None: + return None + return sanitize_user_input( # type: ignore[return-value] + v, "Temporal column", max_length=255, check_sql_keywords=True + ) + time_grain: TimeGrain | None = Field( None, description=( diff --git a/superset/mcp_service/chart/tool/generate_chart.py b/superset/mcp_service/chart/tool/generate_chart.py index 366685d22d3c..697698bbda03 100644 --- a/superset/mcp_service/chart/tool/generate_chart.py +++ b/superset/mcp_service/chart/tool/generate_chart.py @@ -175,7 +175,8 @@ async def generate_chart( # noqa: C901 - Set save_chart=True to permanently save the chart - LLM clients MUST display returned chart URL to users - Use numeric dataset ID or UUID (NOT schema.table_name format) - - MUST include chart_type in config (either 'xy' or 'table') + - MUST include chart_type in config (one of: 'xy', 'table', 'pie', + 'big_number', 'pivot_table', 'mixed_timeseries', 'handlebars') IMPORTANT: The 'chart_type' field in the config is a DISCRIMINATOR that determines which chart configuration schema to use. It MUST be included and MUST match the diff --git a/superset/mcp_service/chart/validation/schema_validator.py b/superset/mcp_service/chart/validation/schema_validator.py index dc0eec9958d6..5fc5af709385 100644 --- a/superset/mcp_service/chart/validation/schema_validator.py +++ b/superset/mcp_service/chart/validation/schema_validator.py @@ -532,7 +532,6 @@ def _format_single_error(err: Dict[str, Any]) -> tuple[str, str]: loc = " -> ".join(loc_parts) msg = err.get("msg", "Validation failed") err_type = err.get("type", "") - ctx = err.get("ctx", {}) or {} field = loc_parts[-1] if loc_parts else "field" if err_type == "string_pattern_mismatch": @@ -543,8 +542,9 @@ def _format_single_error(err: Dict[str, Any]) -> tuple[str, str]: "Use get_dataset_info to find exact column names", ) if err_type == "literal_error": - expected = ctx.get("expected", "") - return f"'{field}' has an invalid value. Expected one of: {expected}", "" + # Preserve the pydantic message ("Input should be ...") which is + # already human-readable; just prefix with the field name for context. + return f"'{field}': {msg}", "" if err_type == "missing": return ( f"Required field '{field}' is missing", diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 5ae7a38fc4ac..68035308a141 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -806,10 +806,17 @@ def test_name_with_spaces_accepted(self) -> None: col = ColumnRef(name="Total Revenue") assert col.name == "Total Revenue" - def test_xss_attempt_blocked(self) -> None: - """sanitize_name() still blocks XSS even without the regex.""" + def test_xss_tags_are_stripped(self) -> None: + """sanitize_name() strips HTML tags via nh3 rather than rejecting them. + The dangerous payload is neutralized; the safe text content is preserved.""" + col = ColumnRef(name="") + assert "") + ColumnRef(name="col onclick=alert(1)") def test_sql_keyword_blocked(self) -> None: """check_sql_keywords=True still blocks pure SQL statements.""" @@ -860,3 +867,10 @@ def test_hyphenated_filter_column_accepted(self) -> None: f = FilterConfig(column="order-status", op="=", value="shipped") assert f.column == "order-status" + + def test_sql_injection_in_filter_column_blocked(self) -> None: + """FilterConfig.sanitize_column uses check_sql_keywords=True.""" + from superset.mcp_service.chart.schemas import FilterConfig + + with pytest.raises(ValidationError): + FilterConfig(column="col; DROP TABLE users; --", op="=", value="x") From a66b8d66472cc854dd7b6463003766cc72477965 Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Tue, 5 May 2026 18:23:26 +0000 Subject: [PATCH 4/8] fix(mcp): remove unused type-ignore, fix test_script_tag assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unused 'type: ignore[return-value]' from sanitize_temporal_column (mypy correctly infers the return type; comment was unnecessary) - Fix test_xss_tags_are_stripped → test_script_tag_blocked: nh3 strips the entire script element including its content, leaving an empty string that the allow_empty=False guard then rejects with ValidationError --- superset/mcp_service/chart/schemas.py | 2 +- .../mcp_service/chart/test_chart_schemas.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index 9632bec2b23f..b09b6dc0556e 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -1099,7 +1099,7 @@ def sanitize_temporal_column(cls, v: str | None) -> str | None: """Sanitize temporal column name to prevent XSS and SQL injection.""" if v is None: return None - return sanitize_user_input( # type: ignore[return-value] + return sanitize_user_input( v, "Temporal column", max_length=255, check_sql_keywords=True ) diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 68035308a141..19d06601912f 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -806,12 +806,12 @@ def test_name_with_spaces_accepted(self) -> None: col = ColumnRef(name="Total Revenue") assert col.name == "Total Revenue" - def test_xss_tags_are_stripped(self) -> None: - """sanitize_name() strips HTML tags via nh3 rather than rejecting them. - The dangerous payload is neutralized; the safe text content is preserved.""" - col = ColumnRef(name="") - assert "") def test_event_handler_injection_blocked(self) -> None: """sanitize_name() rejects event-handler injection patterns (on...=).""" From 3527738a01695d8c296ecfb1deaf44d779031c08 Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Wed, 6 May 2026 20:56:49 +0000 Subject: [PATCH 5/8] =?UTF-8?q?fix(mcp):=20fix=20test=5Fscript=5Ftag=5Fblo?= =?UTF-8?q?cked=20=E2=80=94=20nh3=20strips=20tags,=20preserves=20inner=20t?= =?UTF-8?q?ext?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nh3.clean() removes HTML tag delimiters but preserves the text content between them, so '' becomes 'alert(1)' rather than an empty string. Update the test to assert the tag is stripped (not that a ValidationError is raised). --- .../mcp_service/chart/test_chart_schemas.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 19d06601912f..c53495b51a87 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -806,12 +806,14 @@ def test_name_with_spaces_accepted(self) -> None: col = ColumnRef(name="Total Revenue") assert col.name == "Total Revenue" - def test_script_tag_blocked(self) -> None: - """sanitize_name() blocks script-tag XSS: nh3 strips the entire script - element (tag + content) leaving an empty string, which the empty-value - guard then rejects.""" - with pytest.raises(ValidationError): - ColumnRef(name="") + def test_script_tag_stripped(self) -> None: + """sanitize_name() neutralizes script-tag XSS via nh3. nh3 removes the + tag delimiters but preserves the text content between them, so + '' becomes 'alert(1)' — no raw HTML in the + stored column name.""" + col = ColumnRef(name="") + assert col.name == "alert(1)" + assert "' varies by version: - some versions strip entire element (empty → ValidationError) - others strip only tag delimiters (preserving 'alert(1)') Accept both outcomes: no ValidationError means no ' becomes 'alert(1)' — no raw HTML in the - stored column name.""" - col = ColumnRef(name="") - assert col.name == "alert(1)" - assert "") + assert "