diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0ef8eefba4a4c..5213c5277b23f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -100,7 +100,7 @@ jobs: \"build\": \"$build\", \"pyspark\": \"$pyspark\", \"pyspark-pandas\": \"$pandas\", - \"sparkr\": \"$sparkr\", + \"sparkr\": \"false\", \"tpcds-1g\": \"$tpcds\", \"docker-integration-tests\": \"$docker\", \"scala-213\": \"$build\", @@ -436,10 +436,12 @@ jobs: with: distribution: temurin java-version: ${{ matrix.java }} - - name: List Python packages (Python 3.9, PyPy3) + - name: Install Python packages (Python 3.9) + run: | + python3.9 -m pip install -r ./dev/requirements.txt + - name: List Python packages (Python 3.9) run: | python3.9 -m pip list - pypy3 -m pip list - name: Install Conda for pip packaging test if: ${{ matrix.modules == 'pyspark-errors' }} run: | @@ -542,6 +544,7 @@ jobs: # R issues at docker environment export TZ=UTC export _R_CHECK_SYSTEM_CLOCK_=FALSE + Rscript -e "library(testthat); library(knitr); library(rmarkdown); library(markdown)" ./dev/run-tests --parallelism 1 --modules sparkr - name: Upload test results to report if: always() @@ -718,11 +721,14 @@ jobs: run: ./dev/lint-r - name: Run documentation build run: | + # SparkR is disabled in this fork's CI (see precondition above), so + # always skip the R API doc build to avoid pulling pkgdown / R toolchain + # into the docs job. 
+ export SKIP_RDOC=1 if [ -f "./dev/is-changed.py" ]; then - # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs + # Skip PySpark docs when no PySpark module changed; keep Scala/Java/SQL docs pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi - if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi fi cd docs bundle exec jekyll build diff --git a/R/create-rd.sh b/R/create-rd.sh index 5851b622354bb..09852dd213cdd 100755 --- a/R/create-rd.sh +++ b/R/create-rd.sh @@ -34,4 +34,31 @@ pushd "$FWDIR" > /dev/null . "$FWDIR/find-r.sh" # Generate Rd files if roxygen2 is installed -"$R_SCRIPT_PATH/Rscript" -e ' if(requireNamespace("roxygen2", quietly=TRUE)) { setwd("'$FWDIR'"); roxygen2::roxygenize(package.dir="./pkg", roclets=c("rd")) }' +# +# Workaround for a roxygen2 bug where `add_s3_metadata` (called transitively +# from `topics_process_family` -> `find_object` -> `object_from_name`) tries +# to set `class(val) <- c("s3generic", "function")` on base R primitives such +# as `dim`, `nrow`, `ncol`, `ifelse`, etc. that SparkR registers S4 methods +# for. R does not allow setting attributes on builtins, so the call aborts +# with "cannot set an attribute on a 'builtin'". We override the function to +# return the primitive unchanged when class<- fails. +"$R_SCRIPT_PATH/Rscript" -e ' + if (requireNamespace("roxygen2", quietly = TRUE)) { + if (exists("add_s3_metadata", envir = asNamespace("roxygen2"), inherits = FALSE)) { + orig_add_s3_metadata <- get("add_s3_metadata", envir = asNamespace("roxygen2")) + patched_add_s3_metadata <- function(val, ...) 
{ + tryCatch(orig_add_s3_metadata(val, ...), + error = function(e) { + if (grepl("cannot set an attribute on a .builtin.", conditionMessage(e))) { + val + } else { + stop(e) + } + }) + } + assignInNamespace("add_s3_metadata", patched_add_s3_metadata, ns = "roxygen2") + } + setwd("'$FWDIR'") + roxygen2::roxygenize(package.dir = "./pkg", roclets = c("rd")) + } +' diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index ca8f8defdfdec..2fe8817fdb388 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -550,26 +550,31 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F, ifnotfound = list(list(NULL)))[[1]] found <- sapply(funcList, function(func) { - ifelse( - identical(func, obj) && - # Also check if the parent environment is identical to current parent - identical(parent.env(environment(func)), func.env), - TRUE, FALSE) + if (!identical(func, obj)) { + return(FALSE) + } + # Primitive functions have no R-level environment; identity is enough. + if (is.primitive(func)) { + return(TRUE) + } + # Also check if the parent environment is identical to current parent + identical(parent.env(environment(func)), func.env) }) - if (sum(found) > 0) { - # If function has been examined ignore - break + if (sum(found) == 0) { + # Function has not been examined, record it and recursively clean its closure. + assign(nodeChar, + if (is.null(funcList[[1]])) { + list(obj) + } else { + append(funcList, obj) + }, + envir = checkedFuncs) + obj <- cleanClosure(obj, checkedFuncs) } - # Function has not been examined, record it and recursively clean its closure. - assign(nodeChar, - if (is.null(funcList[[1]])) { - list(obj) - } else { - append(funcList, obj) - }, - envir = checkedFuncs) - obj <- cleanClosure(obj, checkedFuncs) } + # Always include the captured object in the cleaned environment, + # even if a function with the same identity was already examined + # elsewhere (e.g. 
primitives like `+` shared across closures). assign(nodeChar, obj, envir = newEnv) break } @@ -593,6 +598,13 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { # a new version of func that has a correct environment (closure). cleanClosure <- function(func, checkedFuncs = new.env()) { if (is.function(func)) { + # Primitive functions (e.g. `+`, `max`, `min`) have no R-level closure + # to clean: `environment(func) <- newEnv` raises a deprecation warning + # in recent R versions ("setting environment() is + # not possible") which can be converted to an error. Return them as-is. + if (is.primitive(func)) { + return(func) + } newEnv <- new.env(parent = .GlobalEnv) func.body <- body(func) oldEnv <- environment(func) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala index 837382239514a..f00d46549ec25 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala @@ -119,6 +119,43 @@ abstract class DockerJDBCIntegrationSuite private var pulled: Boolean = false protected var jdbcUrl: String = _ + // Number of retry attempts for transient Docker registry / daemon errors + // (e.g. 5xx responses from Docker Hub, which can be flaky in CI). + private val dockerOpMaxAttempts = + sys.props.getOrElse("spark.test.docker.retryAttempts", "5").toInt + private val dockerOpInitialBackoffMs = + sys.props.getOrElse("spark.test.docker.retryInitialBackoffMs", "2000").toLong + + /** + * Retry a Docker operation that may transiently fail due to registry / daemon + * availability issues (HTTP 5xx, network glitches, etc.). Uses exponential backoff. 
+ */ + private def retryOnDockerError[T](description: String)(op: => T): T = { + var attempt = 1 + var backoff = dockerOpInitialBackoffMs + var lastError: Throwable = null + while (attempt <= math.max(1, dockerOpMaxAttempts)) { + try { + return op + } catch { + case NonFatal(e) => + lastError = e + if (attempt >= dockerOpMaxAttempts) { + log.error( + s"Docker operation '$description' failed after $attempt attempt(s); giving up.", e) + } else { + log.warn( + s"Docker operation '$description' failed on attempt $attempt of " + + s"$dockerOpMaxAttempts; retrying in ${backoff}ms.", e) + Thread.sleep(backoff) + backoff = math.min(backoff * 2, 30000L) + } + } + attempt += 1 + } + throw lastError + } + override def beforeAll(): Unit = runIfTestsEnabled(s"Prepare for ${this.getClass.getName}") { super.beforeAll() try { @@ -140,17 +177,23 @@ abstract class DockerJDBCIntegrationSuite // Ensure that the Docker image is installed: docker.inspectImageCmd(db.imageName).exec() } catch { - case e: NotFoundException => + case _: NotFoundException => log.warn(s"Docker image ${db.imageName} not found; pulling image from registry") - docker.pullImageCmd(db.imageName) - .start() - .awaitCompletion(connectionTimeout.value.toSeconds, TimeUnit.SECONDS) + retryOnDockerError(s"pull image ${db.imageName}") { + docker.pullImageCmd(db.imageName) + .start() + .awaitCompletion(connectionTimeout.value.toSeconds, TimeUnit.SECONDS) + } pulled = true } - docker.pullImageCmd(db.imageName) - .start() - .awaitCompletion(connectionTimeout.value.toSeconds, TimeUnit.SECONDS) + // Re-pull to ensure we have the latest version of the image. The registry + // (e.g. 
+ retryOnDockerError(s"pull image ${db.imageName}") { + docker.pullImageCmd(db.imageName) + .start() + .awaitCompletion(connectionTimeout.value.toSeconds, TimeUnit.SECONDS) + } val hostConfig = HostConfig .newHostConfig() diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala index 49c9e3dba0d7f..47e2c88b09e91 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala @@ -46,7 +46,7 @@ class MariaDBKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override val jdbcPort = 3306 override def getJdbcUrl(ip: String, port: Int): String = - s"jdbc:mysql://$ip:$port/mysql?user=$principal" + s"jdbc:mysql://$ip:$port/mysql?user=$principal&permitMysqlScheme" override def getEntryPoint: Option[String] = Some("/docker-entrypoint/mariadb_docker_entrypoint.sh") diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 83176aec80c53..42637942fa091 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -15,60 +15,106 @@ # limitations under the License. # -# Image for building and testing Spark branches. Based on Ubuntu 20.04. +# Image for building and testing Spark branches. Based on Ubuntu 22.04. 
# See also in https://hub.docker.com/_/ubuntu -FROM ubuntu:focal-20221019 +FROM ubuntu:jammy -ENV FULL_REFRESH_DATE 20221118 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +ENV FULL_REFRESH_DATE 20260420 ENV DEBIAN_FRONTEND noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN true -ARG APT_INSTALL="apt-get install --no-install-recommends -y" +ARG APT_INSTALL="apt-get install -y" -RUN apt-get clean -RUN apt-get update -RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9 -RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java +ENV PATH "$PATH:/usr/local/bin" -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN timeout 5 bash -c 'exec 3<>/dev/tcp/archive.ubuntu.com/80 && printf "HEAD /ubuntu/ HTTP/1.1\r\nHost: archive.ubuntu.com\r\nConnection: close\r\n\r\n" >&3 && IFS= read -r s <&3 && [[ "$s" =~ ^HTTP/.*[[:space:]](2|3)[0-9][0-9] ]]' || find /etc/apt -type f \( -name '*.list' -o -name '*.sources' \) -exec sed -i.bak -e 's|archive\.ubuntu\.com|mirror.fcix.net|g' -e 's|security\.ubuntu\.com|mirror.fcix.net|g' {} + +RUN apt-get clean && apt-get update +RUN PKGS="software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools build-essential gfortran libopenblas-dev liblapack-dev gpg gpg-agent software-properties-common gcc g++ make libc6-dev libffi-dev libcurl4-openssl-dev libssl-dev openssl zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev liblzma-dev tk-dev uuid-dev pandoc libuv1-dev libuv1"; $APT_INSTALL $PKGS || (apt-get update && $APT_INSTALL $PKGS) +RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -RUN add-apt-repository ppa:pypy/ppa -RUN apt update -RUN $APT_INSTALL gfortran libopenblas-dev liblapack-dev -RUN $APT_INSTALL build-essential +# We also want Python 3.8 since that's the oldest supported version for Spark 3.5 +# 
Also ubuntu is under a DDoS so retry adding the PPA, and finally fall back to python.org source builds +RUN ( \ + (add-apt-repository -y ppa:deadsnakes/ppa || add-apt-repository -y ppa:deadsnakes/ppa) && \ + (apt-get update || apt-get update) && \ + PKGS="python3.8 python3.9 python3.9-venv python3.8-venv" && ($APT_INSTALL $PKGS || (apt-get update && $APT_INSTALL $PKGS)) \ + ) || \ + (PYTHON_VERSION=3.8.20; \ + curl -O https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ + tar -xzf Python-${PYTHON_VERSION}.tgz && \ + cd Python-${PYTHON_VERSION} && \ + ./configure --enable-shared --prefix=/usr/local LDFLAGS="-Wl,--rpath=/usr/local/lib" && \ + make altinstall && \ + cd .. && \ + PYTHON_VERSION=3.9.25 && \ + curl -O https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ + tar -xzf Python-${PYTHON_VERSION}.tgz && \ + cd Python-${PYTHON_VERSION} && \ + ./configure --enable-shared --prefix=/usr/local LDFLAGS="-Wl,--rpath=/usr/local/lib" && \ + make altinstall) -RUN mkdir -p /usr/local/pypy/pypy3.8 && \ - curl -sqL https://downloads.python.org/pypy/pypy3.8-v7.3.11-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.8 --strip-components=1 && \ - ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \ - ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3 +RUN curl -sS https://bootstrap.pypa.io/pip/3.9/get-pip.py | python3.9 -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN curl -sS https://bootstrap.pypa.io/pip/3.8/get-pip.py | python3.8 -RUN $APT_INSTALL gnupg ca-certificates pandoc -RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list +RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 RUN gpg -a --export E084DAB9 | apt-key add - -RUN add-apt-repository 'deb 
https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' RUN apt update -RUN $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev -RUN Rscript -e "install.packages(c('remotes', 'knitr', 'markdown', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')" +RUN $APT_INSTALL r-base +RUN Rscript -e "install.packages(c('remotes'), repos='https://cloud.r-project.org/')" + +RUN Rscript -e "remotes::install_cran('testthat');" && Rscript -e "library(testthat);" +# rmarkdown bits +RUN Rscript -e "remotes::install_cran('fs');library(fs)" +RUN Rscript -e "remotes::install_cran('sass');library(sass)" + +# Install generic packages we let float + +RUN Rscript -e " \ + options(repos = c(CRAN = 'https://cloud.r-project.org/')); \ + pkgs <- c('knitr', 'markdown', 'rmarkdown', 'e1071', 'survival', 'arrow', 'xml2'); \ + remotes::install_cran(pkgs, upgrade = 'never'); \ + missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]; \ + if (length(missing)) stop('Missing R packages after install: ', paste(missing, collapse = ', ')); \ + " # See more in SPARK-39959, roxygen2 < 7.2.1 -RUN apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ - libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ - libtiff5-dev libjpeg-dev -RUN Rscript -e "install.packages(c('remotes'), repos='https://cloud.r-project.org/')" +RUN Rscript -e "remotes::install_version('pkgload', version = '1.3.2', repos = 'https://cloud.r-project.org'); \ + remotes::install_version('pkgbuild', version = '1.4.0', repos = 'https://cloud.r-project.org'); \ + remotes::install_version('desc', version = '1.4.2', repos = 'https://cloud.r-project.org'); \ + remotes::install_version('rlang', version = '1.1.1', repos = 'https://cloud.r-project.org'); \ + remotes::install_version('cli', version = '3.6.1', repos = 'https://cloud.r-project.org'); \ + remotes::install_version('purrr', version = 
'1.0.1', repos = 'https://cloud.r-project.org')" RUN Rscript -e "remotes::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" +# Sanity check the R install +RUN Rscript -e " \ + library(testthat); \ + library(knitr); \ + library(markdown); \ + library(rmarkdown); \ + library(roxygen2); \ + library(xml2);" + # See more in SPARK-39735 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" -RUN pypy3 -m pip install numpy 'pandas<=2.0.3' scipy coverage matplotlib -RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' +RUN python3.8 -m pip install setuptools virtualenv +RUN python3.9 -m pip install setuptools virtualenv + +RUN python3.8 -m pip install --only-binary=pandas numpy pandas 'scipy<1.9' coverage 'matplotlib==3.7.2' 'mypy==0.982' +RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' 'scipy<=1.10' unittest-xml-reporting 'plotly>=4.8' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'blinker==1.4' 'mypy==0.982' # Add Python deps for Spark Connect. 
RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' # Add torch as a testing dependency for TorchDistributor RUN python3.9 -m pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval + +# pyarrow +RUN python3.9 -m pip install 'pyarrow<13.0.0' +RUN python3.8 -m pip install 'pyarrow<13.0.0' diff --git a/dev/requirements.txt b/dev/requirements.txt index e3c3cae59d05d..43c7625675c83 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -3,10 +3,10 @@ py4j # PySpark dependencies (optional) numpy -pyarrow<13.0.0 -pandas +pyarrow>=4.0.0,<13.0.0 +pandas>=1.0.5,<3 scipy -plotly +plotly<6 mlflow>=2.3.1 scikit-learn matplotlib @@ -34,13 +34,10 @@ pydata_sphinx_theme ipython nbsphinx numpydoc -jinja2<3.0.0 sphinx<3.1.0 sphinx-plotly-directive sphinx-copybutton<0.5.3 docutils<0.18.0 -# See SPARK-38279. -markupsafe==2.0.1 # Development scripts jira diff --git a/python/mypy.ini b/python/mypy.ini index 3443af9a86503..ef0ee36ef8543 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -166,6 +166,19 @@ ignore_missing_imports = True [mypy-grpc.*] ignore_missing_imports = True +; pydantic is pulled in transitively (e.g. via mlflow). mypy has issues +; serializing pydantic v2's recursive JsonValue type, so skip following it. +[mypy-pydantic.*] +ignore_missing_imports = True +follow_imports = skip + +; sqlalchemy is pulled in transitively (e.g. via mlflow). mypy 0.982 hits an +; INTERNAL ERROR while analyzing sqlalchemy/engine/default.py, so skip +; following it. 
+[mypy-sqlalchemy.*] +ignore_missing_imports = True +follow_imports = skip + ; Ignore errors for proto generated code [mypy-pyspark.sql.connect.proto.*, pyspark.sql.connect.proto] ignore_errors = True diff --git a/python/pyspark/ml/tests/typing/test_feature.yml b/python/pyspark/ml/tests/typing/test_feature.yml index 0d1034a44df66..9c9242cf3cd48 100644 --- a/python/pyspark/ml/tests/typing/test_feature.yml +++ b/python/pyspark/ml/tests/typing/test_feature.yml @@ -47,9 +47,9 @@ out: | main:14: error: No overload variant of "StringIndexer" matches argument types "str", "List[str]" [call-overload] main:14: note: Possible overload variants: - main:14: note: def StringIndexer(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer - main:14: note: def StringIndexer(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:14: note: def __init__(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:14: note: def __init__(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer main:15: error: No overload variant of "StringIndexer" matches argument types "List[str]", "str" [call-overload] main:15: note: Possible overload variants: - main:15: note: def StringIndexer(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer - main:15: note: def StringIndexer(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) 
-> StringIndexer + main:15: note: def __init__(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:15: note: def __init__(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py index 5c7b3e01686ae..7a23ff6b50184 100644 --- a/python/pyspark/pandas/typedef/typehints.py +++ b/python/pyspark/pandas/typedef/typehints.py @@ -798,13 +798,19 @@ def _new_type_holders( not isinstance(param, slice) and not isinstance(param, Iterable) for param in params ) else: - # PEP 646 changes `GenericAlias` instances into iterable ones at Python 3.11 + # PEP 646 changes `GenericAlias` instances into iterable ones at Python 3.11. + # GenericAlias is only available on Python 3.11+ and _GenericAlias is a + # private typing internal; resolve them via getattr so mypy (running under + # python_version 3.9) does not flag a missing typing attribute, and the + # 3.11+ runtime still sees the real classes. 
+ _typing_generic_alias: type = getattr(typing, "GenericAlias", type(None)) + _typing_private_generic_alias: type = getattr(typing, "_GenericAlias", type(None)) is_unnamed_params = all( not isinstance(param, slice) and ( not isinstance(param, Iterable) - or isinstance(param, typing.GenericAlias) - or isinstance(param, typing._GenericAlias) + or isinstance(param, _typing_generic_alias) + or isinstance(param, _typing_private_generic_alias) ) for param in params ) diff --git a/python/pyspark/sql/tests/typing/test_functions.yml b/python/pyspark/sql/tests/typing/test_functions.yml index 6c80420bf0a3b..c540f508b39dd 100644 --- a/python/pyspark/sql/tests/typing/test_functions.yml +++ b/python/pyspark/sql/tests/typing/test_functions.yml @@ -70,32 +70,32 @@ main:29: error: No overload variant of "array" matches argument types "List[Column]", "List[Column]" [call-overload] main:29: note: Possible overload variants: main:29: note: def array(*cols: Union[Column, str]) -> Column - main:29: note: def [ColumnOrName_] array(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:29: note: def [ColumnOrName_] array(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:30: error: No overload variant of "create_map" matches argument types "List[Column]", "List[Column]" [call-overload] main:30: note: Possible overload variants: main:30: note: def create_map(*cols: Union[Column, str]) -> Column - main:30: note: def [ColumnOrName_] create_map(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:30: note: def [ColumnOrName_] create_map(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:31: error: No overload variant of "map_concat" matches argument types "List[Column]", "List[Column]" [call-overload] main:31: note: Possible overload variants: main:31: note: def map_concat(*cols: Union[Column, str]) -> Column - main:31: note: def [ColumnOrName_] map_concat(Union[List[ColumnOrName_], Tuple[ColumnOrName_, 
...]]) -> Column + main:31: note: def [ColumnOrName_] map_concat(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:32: error: No overload variant of "struct" matches argument types "List[str]", "List[str]" [call-overload] main:32: note: Possible overload variants: main:32: note: def struct(*cols: Union[Column, str]) -> Column - main:32: note: def [ColumnOrName_] struct(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:32: note: def [ColumnOrName_] struct(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:33: error: No overload variant of "array" matches argument types "List[str]", "List[str]" [call-overload] main:33: note: Possible overload variants: main:33: note: def array(*cols: Union[Column, str]) -> Column - main:33: note: def [ColumnOrName_] array(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:33: note: def [ColumnOrName_] array(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:34: error: No overload variant of "create_map" matches argument types "List[str]", "List[str]" [call-overload] main:34: note: Possible overload variants: main:34: note: def create_map(*cols: Union[Column, str]) -> Column - main:34: note: def [ColumnOrName_] create_map(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:34: note: def [ColumnOrName_] create_map(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:35: error: No overload variant of "map_concat" matches argument types "List[str]", "List[str]" [call-overload] main:35: note: Possible overload variants: main:35: note: def map_concat(*cols: Union[Column, str]) -> Column - main:35: note: def [ColumnOrName_] map_concat(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:35: note: def [ColumnOrName_] map_concat(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:36: error: No overload variant of "struct" matches 
argument types "List[str]", "List[str]" [call-overload] main:36: note: Possible overload variants: main:36: note: def struct(*cols: Union[Column, str]) -> Column - main:36: note: def [ColumnOrName_] struct(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:36: note: def [ColumnOrName_] struct(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 04f430a259c00..bdeb21951e2e1 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1205,8 +1205,8 @@ def fromJson(cls, json: Dict[str, Any]) -> "UserDefinedType": m = __import__(pyModule, globals(), locals(), [pyClass]) if not hasattr(m, pyClass): raise PySparkValueError( - errorClass="UNSUPPORTED_OPERATION", - messageParameters={"operation": "unpickling user defined types"}, + error_class="UNSUPPORTED_OPERATION", + message_parameters={"operation": "unpickling user defined types"}, ) else: UDT = getattr(m, pyClass) diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 02b67d96f1701..3915ab83bc4e1 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -50,7 +50,7 @@ from pyspark.sql.window import Window from pyspark.pandas._typing import IndexOpsLike, SeriesOrIndex -has_numpy = False +has_numpy: bool = False try: import numpy as np # noqa: F401