Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ on:
- 'dev/spark-test-image/python-309/Dockerfile'
- 'dev/spark-test-image/python-310/Dockerfile'
- 'dev/spark-test-image/python-311/Dockerfile'
- 'dev/spark-test-image/python-311-classic-only/Dockerfile'
- 'dev/spark-test-image/python-312/Dockerfile'
- 'dev/spark-test-image/python-313/Dockerfile'
- 'dev/spark-test-image/python-313-nogil/Dockerfile'
Expand Down Expand Up @@ -191,6 +192,19 @@ jobs:
- name: Image digest (PySpark with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_311.outputs.digest }}
- name: Build and push (PySpark Classic Only with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311-classic-only/Dockerfile') != ''
id: docker_build_pyspark_python_311_classic_only
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-311-classic-only/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-classic-only-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-classic-only-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-311-classic-only-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark Classic Only with Python 3.11)
if: hashFiles('dev/spark-test-image/python-311-classic-only/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_311_classic_only.outputs.digest }}
- name: Build and push (PySpark with Python 3.12)
if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
id: docker_build_pyspark_python_312
Expand Down
47 changes: 47 additions & 0 deletions .github/workflows/build_python_3.11_classic_only.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Scheduled CI run of the PySpark test suite on master against a Python 3.11
# image that has no Spark Connect extras installed ("classic only"), to catch
# regressions on the classic (non-Connect) code path.
name: "Build / Python-only Classic-only (master, Python 3.11)"

on:
  schedule:
    # Every third day at 00:00 UTC.
    - cron: '0 0 */3 * *'
  # Allow manual triggering from the Actions tab.
  workflow_dispatch:

jobs:
  run-build:
    permissions:
      # Required to pull/push the cached test image from/to GHCR.
      packages: write
    name: Run
    # Delegate to the shared reusable build-and-test workflow.
    uses: ./.github/workflows/build_and_test.yml
    # Only run in the canonical repository, never in forks.
    if: github.repository == 'apache/spark'
    with:
      java: 17
      branch: master
      hadoop: hadoop3
      envs: >-
        {
          "PYSPARK_IMAGE_TO_TEST": "python-311-classic-only",
          "PYTHON_TO_TEST": "python3.11"
        }
      jobs: >-
        {
          "pyspark": "true",
          "pyspark-pandas": "true"
        }
79 changes: 79 additions & 0 deletions dev/spark-test-image/python-311-classic-only/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image for building and testing Spark branches. Based on Ubuntu 22.04.
# See also in https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy-20240911.1
LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
LABEL org.opencontainers.image.licenses="Apache-2.0"
LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark Classic with Python 3.11"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

# Bump this date to force a full, cache-busting rebuild of every layer below.
ENV FULL_REFRESH_DATE=20250424

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true

# Build toolchain and native libraries required to compile/run the Python
# packages installed below (scipy, matplotlib, pyarrow, mlflow, etc.).
# Packages are sorted alphabetically; apt lists are removed in the same layer
# to keep it small (the next apt layer runs its own `apt-get update`).
RUN apt-get update && apt-get install -y \
    build-essential \
    ca-certificates \
    curl \
    gfortran \
    git \
    gnupg \
    libcurl4-openssl-dev \
    libfontconfig1-dev \
    libfreetype6-dev \
    libfribidi-dev \
    libgit2-dev \
    libharfbuzz-dev \
    libjpeg-dev \
    liblapack-dev \
    libopenblas-dev \
    libpng-dev \
    libpython3-dev \
    libssl-dev \
    libtiff5-dev \
    libxml2-dev \
    openjdk-17-jdk-headless \
    pkg-config \
    qpdf \
    software-properties-common \
    tzdata \
    wget \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python 3.11 from the deadsnakes PPA (jammy ships Python 3.10).
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
    python3.11 \
    && apt-get autoremove --purge -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*


# Version specifiers with '>' / '<' inside these ARGs are safe unquoted at the
# point of use: operators coming from variable expansion are not reinterpreted
# as shell redirections.
ARG BASIC_PIP_PKGS="numpy pyarrow>=19.0.0 pandas==2.2.3 plotly<6.0.0 matplotlib openpyxl memory-profiler>=0.61.0 mlflow>=2.8.1 scipy scikit-learn>=1.3.2"
ARG TEST_PIP_PKGS="coverage unittest-xml-reporting"

# Install Python 3.11 packages
# Bootstrap pip for python3.11 (the deadsnakes package does not bundle it).
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
# The requirement must be quoted: a literal unquoted '>' on the command line is
# parsed by the shell as an output redirection (creating a file named '=1.6.2'
# and dropping the version constraint). mlflow needs blinker.
RUN python3.11 -m pip install --ignore-installed 'blinker>=1.6.2'
RUN python3.11 -m pip install $BASIC_PIP_PKGS $TEST_PIP_PKGS && \
    python3.11 -m pip install 'torch<2.6.0' torchvision --index-url https://download.pytorch.org/whl/cpu && \
    python3.11 -m pip install deepspeed torcheval && \
    python3.11 -m pip cache purge
21 changes: 6 additions & 15 deletions python/pyspark/ml/connect/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,25 +58,16 @@ def _test() -> None:
print("Not supported in no-GIL mode", file=sys.stderr)
sys.exit(0)

from pyspark.testing import should_test_connect

if not should_test_connect:
print(f"Skipping pyspark.ml.functions doctests", file=sys.stderr)
sys.exit(0)

import doctest
from pyspark.sql import SparkSession as PySparkSession
import pyspark.ml.connect.functions

from pyspark.sql.pandas.utils import (
require_minimum_pandas_version,
require_minimum_pyarrow_version,
)

try:
require_minimum_pandas_version()
require_minimum_pyarrow_version()
except Exception as e:
print(
f"Skipping pyspark.ml.functions doctests: {e}",
file=sys.stderr,
)
sys.exit(0)

globs = pyspark.ml.connect.functions.__dict__.copy()

globs["spark"] = (
Expand Down
6 changes: 6 additions & 0 deletions python/pyspark/sql/connect/tvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,12 @@ def _test() -> None:
print("Not supported in no-GIL mode", file=sys.stderr)
sys.exit(0)

from pyspark.testing import should_test_connect

if not should_test_connect:
print(f"Skipping pyspark.ml.functions doctests", file=sys.stderr)
sys.exit(0)

import doctest
from pyspark.sql import SparkSession as PySparkSession
import pyspark.sql.connect.tvf
Expand Down