Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -639,20 +639,12 @@ jobs:
$py -m pip list
echo ""
done
- name: Install Conda for pip packaging test
if: contains(matrix.modules, 'pyspark-errors')
uses: conda-incubator/setup-miniconda@v3
with:
miniforge-version: latest
activate-environment: ""
auto-activate: false
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
shell: 'script -q -e -c "bash {0}"'
run: |
if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
export PATH=$CONDA/bin:$PATH
export SKIP_PACKAGING=false
echo "Python Packaging Tests Enabled!"
fi
Expand Down
128 changes: 50 additions & 78 deletions dev/run-pip-tests
Original file line number Diff line number Diff line change
Expand Up @@ -43,32 +43,22 @@ function delete_virtualenv() {
}
trap delete_virtualenv EXIT

PYTHON_EXECS=()
# Some systems don't have pip or virtualenv - in those cases our tests won't work.
if hash virtualenv 2>/dev/null && [ ! -n "$USE_CONDA" ]; then
echo "virtualenv installed - using. Note if this is a conda virtual env you may wish to set USE_CONDA"
# test only against python3
if hash python3 2>/dev/null; then
PYTHON_EXECS=('python3')
else
echo "Python3 not installed on system, skipping pip installability tests"
exit 0
fi
elif hash conda 2>/dev/null; then
echo "Using conda virtual environments"
PYTHON_EXECS=('3.10')
USE_CONDA=1

if [ -z "${PYTHON_TO_TEST}" ]; then
PYTHON_EXECUTABLE="python3"
else
echo "Missing virtualenv & conda, skipping pip installability tests"
exit 0
PYTHON_EXECUTABLE="${PYTHON_TO_TEST}"
fi
if ! hash pip 2>/dev/null; then
echo "Missing pip, skipping pip installability tests."

if ! hash "$PYTHON_EXECUTABLE" 2>/dev/null; then
echo "Python executable $PYTHON_EXECUTABLE not installed on system, skipping pip installability tests"
exit 0
fi

echo "Using Python executable: $PYTHON_EXECUTABLE"

# Determine which version of PySpark we are building for archive name
PYSPARK_VERSION=$(python3 -c "exec(open('python/pyspark/version.py').read());print(__version__)")
PYSPARK_VERSION=$($PYTHON_EXECUTABLE -c "exec(open('python/pyspark/version.py').read());print(__version__)")
PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz"
# The pip install options we use for all the pip commands
PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall --use-pep517"
Expand All @@ -80,64 +70,46 @@ PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST"
# In this test, explicitly exclude user sitepackages to prevent side effects
export PYTHONNOUSERSITE=1

for python in "${PYTHON_EXECS[@]}"; do
for install_command in "${PIP_COMMANDS[@]}"; do
echo "Testing pip installation with python $python"
# Create a temp directory for us to work in and save its name to a file for cleanup
echo "Using $VIRTUALENV_BASE for virtualenv"
VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python
rm -rf "$VIRTUALENV_PATH"
if [ -n "$USE_CONDA" ]; then
conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools
source activate "$VIRTUALENV_PATH" || conda activate "$VIRTUALENV_PATH"
else
mkdir -p "$VIRTUALENV_PATH"
virtualenv --python=$python "$VIRTUALENV_PATH"
source "$VIRTUALENV_PATH"/bin/activate
fi
# Upgrade pip & friends if using virtual env
if [ ! -n "$USE_CONDA" ]; then
pip install --upgrade pip wheel numpy
fi

echo "Creating pip installable source dist"
cd "$FWDIR"/python
# Delete the egg info file if it exists, this can cache the setup file.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 packaging/classic/setup.py sdist


echo "Installing dist into virtual env"
cd dist
# Verify that the dist directory only contains one thing to install
sdists=(*.tar.gz)
if [ ${#sdists[@]} -ne 1 ]; then
echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first."
exit -1
fi
# Do the actual installation
cd "$FWDIR"
$install_command

cd /

echo "Run basic sanity check on pip installed version with spark-submit"
spark-submit "$FWDIR"/dev/pip-sanity-check.py
echo "Run basic sanity check with import based"
python3 "$FWDIR"/dev/pip-sanity-check.py
echo "Run the tests for context.py"
python3 "$FWDIR"/python/pyspark/core/context.py

cd "$FWDIR"

# conda / virtualenv environments need to be deactivated differently
if [ -n "$USE_CONDA" ]; then
source deactivate || conda deactivate
else
deactivate
fi

done
for install_command in "${PIP_COMMANDS[@]}"; do
# Create a temp directory for us to work in and save its name to a file for cleanup
echo "Using $VIRTUALENV_BASE for virtualenv"
VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python
rm -rf "$VIRTUALENV_PATH"
$PYTHON_EXECUTABLE -m venv "$VIRTUALENV_PATH"
source "$VIRTUALENV_PATH"/bin/activate
pip install --upgrade pip wheel numpy setuptools

echo "Creating pip installable source dist"
cd "$FWDIR"/python
# Delete the egg info file if it exists, this can cache the setup file.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 packaging/classic/setup.py sdist

echo "Installing dist into virtual env"
cd dist
# Verify that the dist directory only contains one thing to install
sdists=(*.tar.gz)
if [ ${#sdists[@]} -ne 1 ]; then
echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first."
exit -1
fi
# Do the actual installation
cd "$FWDIR"
$install_command

cd /

echo "Run basic sanity check on pip installed version with spark-submit"
spark-submit "$FWDIR"/dev/pip-sanity-check.py
echo "Run basic sanity check with import based"
python3 "$FWDIR"/dev/pip-sanity-check.py
echo "Run the tests for context.py"
python3 "$FWDIR"/python/pyspark/core/context.py

cd "$FWDIR"

deactivate

done

exit 0
6 changes: 3 additions & 3 deletions dev/spark-test-image/lint/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for Linter"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260208
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand Down Expand Up @@ -52,6 +52,7 @@ RUN apt-get update && apt-get install -y \
npm \
pkg-config \
python3.12 \
python3.12-venv \
qpdf \
tzdata \
r-base \
Expand All @@ -72,10 +73,9 @@ ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.12 -m venv --without-pip $VIRTUAL_ENV
RUN python3.12 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
RUN python3.12 -m pip install \
'black==23.12.1' \
'flake8==3.9.0' \
Expand Down
8 changes: 3 additions & 5 deletions dev/spark-test-image/python-310/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260206
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand Down Expand Up @@ -53,18 +53,16 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.10 \
python3.10-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.10 -m venv --without-pip $VIRTUAL_ENV
RUN python3.10 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.10 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas==2.3.3 scipy plotly<6.0.0 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2 pystack>=1.6.0 psutil"
ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3"

Expand Down
8 changes: 3 additions & 5 deletions dev/spark-test-image/python-311/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260206
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -50,18 +50,16 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.11 \
python3.11-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.11 -m venv --without-pip $VIRTUAL_ENV
RUN python3.11 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.11 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas==2.3.3 scipy plotly<6.0.0 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2 pystack>=1.6.0 psutil"
ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3"

Expand Down
8 changes: 3 additions & 5 deletions dev/spark-test-image/python-312-classic-only/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark Cl
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260207
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -42,6 +42,7 @@ RUN apt-get update && apt-get install -y \
libssl-dev \
openjdk-17-jdk-headless \
python3.12 \
python3.12-venv \
pkg-config \
tzdata \
software-properties-common \
Expand All @@ -52,12 +53,9 @@ RUN apt-get update && apt-get install -y \

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.12 -m venv --without-pip $VIRTUAL_ENV
RUN python3.12 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.12 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 pandas==2.3.3 plotly<6.0.0 matplotlib openpyxl memory-profiler>=0.61.0 mlflow>=2.8.1 scipy scikit-learn>=1.3.2 pystack>=1.6.0 psutil"
ARG TEST_PIP_PKGS="coverage unittest-xml-reporting"

Expand Down
9 changes: 3 additions & 6 deletions dev/spark-test-image/python-312-pandas-3/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260207
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -45,6 +45,7 @@ RUN apt-get update && apt-get install -y \
libssl-dev \
openjdk-17-jdk-headless \
python3.12 \
python3.12-venv \
pkg-config \
tzdata \
software-properties-common \
Expand All @@ -55,13 +56,9 @@ RUN apt-get update && apt-get install -y \

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.12 -m venv --without-pip $VIRTUAL_ENV
RUN python3.12 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.12 packages
# Note that mlflow is execluded since it requires pandas<3
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas>=3 scipy plotly<6.0.0 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3"

Expand Down
8 changes: 3 additions & 5 deletions dev/spark-test-image/python-312/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260204
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -42,6 +42,7 @@ RUN apt-get update && apt-get install -y \
libssl-dev \
openjdk-17-jdk-headless \
python3.12 \
python3.12-venv \
pkg-config \
tzdata \
software-properties-common \
Expand All @@ -52,12 +53,9 @@ RUN apt-get update && apt-get install -y \

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.12 -m venv --without-pip $VIRTUAL_ENV
RUN python3.12 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.12 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas==2.3.3 scipy plotly<6.0.0 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2 pystack>=1.6.0 psutil"
ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3"

Expand Down
8 changes: 3 additions & 5 deletions dev/spark-test-image/python-313/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260206
ENV FULL_REFRESH_DATE=20260210

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
Expand All @@ -50,18 +50,16 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.13 \
python3.13-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Setup virtual environment
ENV VIRTUAL_ENV=/opt/spark-venv
RUN python3.13 -m venv --without-pip $VIRTUAL_ENV
RUN python3.13 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Install Python 3.13 packages
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.13

ARG BASIC_PIP_PKGS="numpy pyarrow>=22.0.0 six==1.16.0 pandas==2.3.3 scipy plotly<6.0.0 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2 pystack>=1.6.0 psutil"
ARG CONNECT_PIP_PKGS="grpcio==1.76.0 grpcio-status==1.76.0 protobuf==6.33.5 googleapis-common-protos==1.71.0 zstandard==0.25.0 graphviz==0.20.3"

Expand Down
Loading
Loading