diff --git a/doc/user/content/sql/create-source/mysql.md b/doc/user/content/sql/create-source/mysql.md index 248206b91c2ca..93ce730ad7b0c 100644 --- a/doc/user/content/sql/create-source/mysql.md +++ b/doc/user/content/sql/create-source/mysql.md @@ -294,7 +294,10 @@ CREATE SOURCE mz_source If you're replicating tables that use [data types unsupported](#supported-types) by Materialize, use the `TEXT COLUMNS` option to decode data as `text` for the -affected columns. This option expects the upstream fully-qualified names of the +affected columns. `TEXT COLUMNS` should also be used for columns that contain +MySQL zero-value `DATE`, `DATETIME`, or `TIMESTAMP` data. + +This option expects the upstream fully-qualified names of the replicated table and column (i.e. as defined in your MySQL database). ```mzsql diff --git a/doc/user/data/mysql_source_details.yml b/doc/user/data/mysql_source_details.yml index 98db0fb56975d..295b63bfb836c 100644 --- a/doc/user/data/mysql_source_details.yml +++ b/doc/user/data/mysql_source_details.yml @@ -67,6 +67,20 @@ - Use the [`EXCLUDE COLUMNS`](/sql/create-source/mysql/#excluding-columns) option to exclude any columns that contain unsupported data types. + #### Zero values for `date`, `datetime`, and `timestamp` + + MySQL allows the special "zero" values `0000-00-00` and `0000-00-00 + 00:00:00` in `date`, `datetime`, and `timestamp` columns when the server + `sql_mode` does not include `NO_ZERO_DATE` or `NO_ZERO_IN_DATE`. These + values are not representable in Materialize's corresponding native types, + so they will cause ingestion to fail for the affected column. + + To ingest columns that contain zero values, use [`TEXT + COLUMNS`](/sql/create-source/mysql/#handling-unsupported-types) to + decode the affected columns as `text`. The zero values for `date`, + `datetime`, `timestamp`, and `year` are preserved verbatim as strings + (e.g. `"0000-00-00 00:00:00"`, `"0000"`). 
+ - name: mysql-truncation-restriction content: | Avoid truncating upstream tables that are being replicated into Materialize. diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index 53b78be10b65b..926c2568b2199 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -361,8 +361,16 @@ fn pack_val_as_datum( } } Some(MySqlColumnMeta::Year) => { - let val = from_value_opt::(value)?; - packer.push(Datum::String(&val.to_string())); + let mut val = from_value_opt::(value)?; + // mysql_common incorrectly handles MySQL YEAR type, which has a valid range + // of 1901-2155 (https://dev.mysql.com/doc/refman/8.0/en/year.html) + // + // We treat the value 1900 as the zero-value year - "0000" + // https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L124-L129 + if val == 1900 { + val = 0; + } + packer.push(Datum::String(&format!("{val:04}"))); } Some(MySqlColumnMeta::Date) => { // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 0000-00-00), so diff --git a/test/mysql-cdc/30-text-columns.td b/test/mysql-cdc/30-text-columns.td index 0b8e438fed6d7..28acf4bea53f7 100644 --- a/test/mysql-cdc/30-text-columns.td +++ b/test/mysql-cdc/30-text-columns.td @@ -24,7 +24,10 @@ $ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-pass # Insert data into MySQL that can't be decoded using native types and must be decoded # as a TEXT COLUMN. DATE-type coverage lives in text-columns-date.td; -# TIMESTAMP/DATETIME coverage lives in text-columns-timestamp.td. +# TIMESTAMP/DATETIME coverage lives in text-columns-timestamp.td; +# YEAR coverage (including the zero-year sentinel) lives in text-columns-year.td. +# The YEAR usage retained below is intentional, as part of the multi-column +# TEXT COLUMNS integration check (combined-clause SHOW CREATE TABLE rewrite). 
$ mysql-execute name=mysql DROP DATABASE IF EXISTS public; diff --git a/test/mysql-cdc/text-columns-date.td b/test/mysql-cdc/text-columns-date.td index 4bf5a8eed392c..df1e8537f5208 100644 --- a/test/mysql-cdc/text-columns-date.td +++ b/test/mysql-cdc/text-columns-date.td @@ -56,6 +56,18 @@ COMMIT; WITH (TEXT COLUMNS = (event_date)); > COMMIT +# Block until the snapshot is fully ingested before issuing the +# post-snapshot inserts, so those rows go through the binlog decode +# path rather than being absorbed into the snapshot. +> SELECT id, event_date FROM events ORDER BY id; +1 "2024-04-03" +2 "0000-00-00" +3 <null> +4 "1000-01-01" +5 "9999-12-31" +11 "2024-00-01" +12 "2024-01-00" + # Post-snapshot rows exercise the replication / binlog decode path. $ mysql-execute name=mysql USE public; @@ -136,6 +148,9 @@ INSERT INTO reports VALUES (1, '2024-04-03'); > CREATE TABLE reports FROM SOURCE da (REFERENCE public.reports); > COMMIT +# Block until the snapshot lands before issuing the binlog-path insert, +# so the zero-date below is decoded as a replication event, not a +# snapshot row. > SELECT * FROM reports; 1 "2024-04-03" diff --git a/test/mysql-cdc/text-columns-timestamp.td b/test/mysql-cdc/text-columns-timestamp.td index c97d9ab9d3a79..ba566179bc55f 100644 --- a/test/mysql-cdc/text-columns-timestamp.td +++ b/test/mysql-cdc/text-columns-timestamp.td @@ -56,6 +56,16 @@ COMMIT; WITH (TEXT COLUMNS = (created_at, updated_at, archived_at, born_at, mid_at)); > COMMIT +# Block until the snapshot is fully ingested before issuing the +# post-snapshot inserts, so those rows go through the binlog decode +# path rather than being absorbed into the snapshot. 
+> SELECT id, created_at, updated_at, archived_at, born_at, mid_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13" "2024-04-03 10:15:13.123456" "2024-04-03 10:15:13" "2024-04-03 10:15:13.123456" "2024-04-03 10:15:13.1234" +2 "0000-00-00 00:00:00" "0000-00-00 00:00:00.000000" "0000-00-00 00:00:00" "0000-00-00 00:00:00.000000" "0000-00-00 00:00:00.0000" +3 <null> <null> <null> <null> <null> +7 <null> <null> "1001-01-01 00:00:00" "1001-01-01 00:00:00.000001" "1001-01-01 00:00:00.0001" +8 <null> <null> "9999-12-31 23:59:59" "9999-12-31 23:59:59.999999" "9999-12-31 23:59:59.9999" + # Post-snapshot rows exercise the replication / binlog decode path, # which uses a different mysql_common::Value variant than the snapshot. $ mysql-execute name=mysql diff --git a/test/mysql-cdc/text-columns-year.td b/test/mysql-cdc/text-columns-year.td new file mode 100644 index 0000000000000..3ef6435784b8c --- /dev/null +++ b/test/mysql-cdc/text-columns-year.td @@ -0,0 +1,120 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# +# Regression test for MySQL YEAR columns ingested via TEXT COLUMNS, +# with a mix of valid values and the zero-year sentinel. +# +# YEAR is one of the MySQL types that cannot be ingested natively in +# Materialize (see schemas.rs: YEAR is only mapped in parse_as_text_column, +# not in the native parser). CREATE TABLE FROM SOURCE on a YEAR column +# without TEXT COLUMNS errors with "unsupported type"; declaring the column +# in TEXT COLUMNS is the documented workaround. 
+# +# Per https://dev.mysql.com/doc/refman/8.0/en/year.html: +# * YEAR range: 1901 to 2155 +# * Zero value: 0000 (allowed when sql_mode lacks NO_ZERO_DATE) +# +# YEAR values are rendered zero-padded to four digits, matching the +# literal MySQL form and the DATE/TIMESTAMP zero-value convention +# ("0000-00-00", "0000-00-00 00:00:00"). The binlog decode path +# additionally remaps mysql_common's 1900-on-the-wire representation +# of the zero-year back to 0; the snapshot row (id 2) and the binlog +# row (id 7) below pin both paths. + +> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' + +> CREATE CONNECTION mysqc TO MYSQL ( + HOST mysql, + USER root, + PASSWORD SECRET mysqlpass + ) + +$ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} + +# sql_mode = '' is required so MySQL accepts the zero-year that motivates +# the use of TEXT COLUMNS in the first place. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE events (id INT PRIMARY KEY, event_year YEAR NULL); +START TRANSACTION; +INSERT INTO events VALUES (1, '2024'), (2, '0000'), (3, NULL); +# Boundary rows: min and max valid YEAR values. +INSERT INTO events VALUES (4, '1901'), (5, '2155'); +COMMIT; + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE events FROM SOURCE da (REFERENCE public.events) + WITH (TEXT COLUMNS = (event_year)); +> COMMIT + +> SELECT id, event_year FROM events ORDER BY id; +1 "2024" +2 "0000" +3 <null> +4 "1901" +5 "2155" + +# Post-snapshot rows exercise the replication / binlog decode path. 
+$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +START TRANSACTION; +INSERT INTO events VALUES (6, '2025'), (7, 0), (8, NULL); +INSERT INTO events VALUES (9, '1901'), (10, '2155'); +COMMIT; + +> SELECT id, event_year FROM events ORDER BY id; +1 "2024" +2 "0000" +3 <null> +4 "1901" +5 "2155" +6 "2025" +7 "0000" +8 <null> +9 "1901" +10 "2155" + +# Verify the column type was rewritten to text by TEXT COLUMNS. +> SELECT pg_typeof(event_year) FROM events LIMIT 1; +text + +# None of the data above should have caused the source to go into a stalled state. +> SELECT name, status, error IS NULL FROM mz_internal.mz_source_statuses WHERE name IN ('da', 'events') ORDER BY name; +da running true +events running true + +> DROP SOURCE da CASCADE; + +# +# Negative path: a YEAR column that is NOT declared as a TEXT COLUMN cannot be +# ingested. YEAR has no native mapping in Materialize, so CREATE TABLE FROM +# SOURCE must error. This guards the documented workaround: declare YEAR +# columns in TEXT COLUMNS. +# + +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +CREATE TABLE reports (id INT PRIMARY KEY, reported_year YEAR NULL); +INSERT INTO reports VALUES (1, '2024'); + +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +! CREATE TABLE reports FROM SOURCE da (REFERENCE public.reports); +contains: unsupported type + +> DROP SOURCE da CASCADE;