8 changes: 0 additions & 8 deletions .github/workflows/build_and_test.yml
@@ -632,20 +632,12 @@ jobs:
$py -m pip list
echo ""
done
- name: Install Conda for pip packaging test
if: contains(matrix.modules, 'pyspark-errors')
uses: conda-incubator/setup-miniconda@v3
with:
miniforge-version: latest
activate-environment: ""
auto-activate: false
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
shell: 'script -q -e -c "bash {0}"'
run: |
if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
export PATH=$CONDA/bin:$PATH
export SKIP_PACKAGING=false
echo "Python Packaging Tests Enabled!"
fi
128 changes: 50 additions & 78 deletions dev/run-pip-tests
@@ -43,32 +43,22 @@ function delete_virtualenv() {
}
trap delete_virtualenv EXIT

PYTHON_EXECS=()
# Some systems don't have pip or virtualenv - in those cases our tests won't work.
if hash virtualenv 2>/dev/null && [ ! -n "$USE_CONDA" ]; then
echo "virtualenv installed - using. Note if this is a conda virtual env you may wish to set USE_CONDA"
# test only against python3
if hash python3 2>/dev/null; then
PYTHON_EXECS=('python3')
else
echo "Python3 not installed on system, skipping pip installability tests"
exit 0
fi
elif hash conda 2>/dev/null; then
echo "Using conda virtual environments"
PYTHON_EXECS=('3.10')
USE_CONDA=1

if [ -z "${PYTHON_TO_TEST}" ]; then
PYTHON_EXECUTABLE="python3"
else
echo "Missing virtualenv & conda, skipping pip installability tests"
exit 0
PYTHON_EXECUTABLE="${PYTHON_TO_TEST}"
fi
if ! hash pip 2>/dev/null; then
echo "Missing pip, skipping pip installability tests."

if ! hash "$PYTHON_EXECUTABLE" 2>/dev/null; then
echo "Python executable $PYTHON_EXECUTABLE not installed on system, skipping pip installability tests"
exit 0
fi

echo "Using Python executable: $PYTHON_EXECUTABLE"

# Determine which version of PySpark we are building for archive name
PYSPARK_VERSION=$(python3 -c "exec(open('python/pyspark/version.py').read());print(__version__)")
PYSPARK_VERSION=$($PYTHON_EXECUTABLE -c "exec(open('python/pyspark/version.py').read());print(__version__)")
PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz"
# The pip install options we use for all the pip commands
PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall --use-pep517"
@@ -80,64 +70,46 @@ PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST"
# In this test, explicitly exclude user sitepackages to prevent side effects
export PYTHONNOUSERSITE=1

for python in "${PYTHON_EXECS[@]}"; do
for install_command in "${PIP_COMMANDS[@]}"; do
echo "Testing pip installation with python $python"
# Create a temp directory for us to work in and save its name to a file for cleanup
echo "Using $VIRTUALENV_BASE for virtualenv"
VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python
rm -rf "$VIRTUALENV_PATH"
if [ -n "$USE_CONDA" ]; then
conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools
source activate "$VIRTUALENV_PATH" || conda activate "$VIRTUALENV_PATH"
else
mkdir -p "$VIRTUALENV_PATH"
virtualenv --python=$python "$VIRTUALENV_PATH"
source "$VIRTUALENV_PATH"/bin/activate
fi
# Upgrade pip & friends if using virtual env
if [ ! -n "$USE_CONDA" ]; then
pip install --upgrade pip wheel numpy
fi

echo "Creating pip installable source dist"
cd "$FWDIR"/python
# Delete the egg info file if it exists, this can cache the setup file.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 packaging/classic/setup.py sdist


echo "Installing dist into virtual env"
cd dist
# Verify that the dist directory only contains one thing to install
sdists=(*.tar.gz)
if [ ${#sdists[@]} -ne 1 ]; then
echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first."
exit -1
fi
# Do the actual installation
cd "$FWDIR"
$install_command

cd /

echo "Run basic sanity check on pip installed version with spark-submit"
spark-submit "$FWDIR"/dev/pip-sanity-check.py
echo "Run basic sanity check with import based"
python3 "$FWDIR"/dev/pip-sanity-check.py
echo "Run the tests for context.py"
python3 "$FWDIR"/python/pyspark/core/context.py

cd "$FWDIR"

# conda / virtualenv environments need to be deactivated differently
if [ -n "$USE_CONDA" ]; then
source deactivate || conda deactivate
else
deactivate
fi

done
for install_command in "${PIP_COMMANDS[@]}"; do
# Create a temp directory for us to work in and save its name to a file for cleanup
echo "Using $VIRTUALENV_BASE for virtualenv"
VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python
rm -rf "$VIRTUALENV_PATH"
$PYTHON_EXECUTABLE -m venv "$VIRTUALENV_PATH"
source "$VIRTUALENV_PATH"/bin/activate
pip install --upgrade pip wheel numpy setuptools

echo "Creating pip installable source dist"
cd "$FWDIR"/python
# Delete the egg info file if it exists, this can cache the setup file.
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
python3 packaging/classic/setup.py sdist

echo "Installing dist into virtual env"
cd dist
# Verify that the dist directory only contains one thing to install
sdists=(*.tar.gz)
if [ ${#sdists[@]} -ne 1 ]; then
echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first."
exit -1
fi
# Do the actual installation
cd "$FWDIR"
$install_command

cd /

echo "Run basic sanity check on pip installed version with spark-submit"
spark-submit "$FWDIR"/dev/pip-sanity-check.py
echo "Run basic sanity check with import based"
python3 "$FWDIR"/dev/pip-sanity-check.py
echo "Run the tests for context.py"
python3 "$FWDIR"/python/pyspark/core/context.py

cd "$FWDIR"

deactivate

done

exit 0
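
For context, the rewritten script now takes its interpreter from the PYTHON_TO_TEST environment variable and falls back to python3. A local run could look roughly like the sketch below; the checkout path and interpreter choice are assumptions for illustration, not part of the change:

    # Sketch: exercising dev/run-pip-tests locally against a specific interpreter.
    cd path/to/spark                      # root of a Spark source checkout (assumed)
    export PYTHON_TO_TEST=python3.11      # any interpreter on PATH; unset to default to python3
    ./dev/run-pip-tests
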
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-310/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -46,6 +46,7 @@ RUN apt-get update && apt-get install -y \
openjdk-17-jdk-headless \
pkg-config \
python3.10 \
python3.10-venv \
python3-psutil \
qpdf \
tzdata \
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-311/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.11 \
python3.11-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-312-classic-only/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark Cl
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.12 \
python3.12-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-312-pandas-3/Dockerfile
@@ -27,7 +27,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260127
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -53,6 +53,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.12 \
python3.12-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-312/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.12 \
python3.12-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-313/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.13 \
python3.13-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-314-nogil/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.14-nogil \
python3.14-venv \

Contributor:
It seems we cannot install the package this way; deadsnakes doesn't provide such a package.

And for venv, do we need to install it separately? It should be available after installing Python.

Contributor Author:
Huh? Could you post the failure? Is it unsupported only for 3.14 or 3.14t? That's how I install venv.

So venv, like many other "standard library" modules, is actually an optional part of a Python installation. How it is packaged is determined by the distribution; on Linux, I believe venv is not always bundled with python.

We could use uv, but this should work in theory. deadsnakes should have it.
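
For reference, a rough illustration of the distinction being discussed, with package names and behavior assumed for a Debian/Ubuntu base image and not verified against the deadsnakes 3.14 builds:

    # Sketch only: on Debian/Ubuntu the interpreter and a working venv module can
    # ship as separate apt packages, so installing the interpreter alone may not be enough.
    apt-get update && apt-get install -y python3.12          # interpreter only
    python3.12 -m venv /tmp/check-venv \
      || echo "venv/ensurepip not bundled with this interpreter package"

    # Installing the matching -venv package (where the distro provides one)
    # is what makes `pythonX.Y -m venv` usable:
    apt-get install -y python3.12-venv
    python3.12 -m venv /tmp/check-venv && /tmp/check-venv/bin/pip --version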

Contributor:
https://github.com/zhengruifeng/spark/actions/runs/21749252022/job/62742834544

It actually failed with python3.14-pip.

When upgrading the OS to Ubuntu 24, it seems venv is already available after installing python3.14.

Contributor Author:
Hmm, okay. In that case I'll remove this line.

&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-314/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260203
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -50,6 +50,7 @@ RUN apt-get update && apt-get install -y \
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python3.14 \
python3.14-venv \
&& apt-get autoremove --purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-minimum/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark wi
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260127
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -44,6 +44,7 @@ RUN apt-get update && apt-get install -y \
openjdk-17-jdk-headless \
pkg-config \
python3.10 \
python3.10-venv \
python3-psutil \
tzdata \
software-properties-common \
3 changes: 2 additions & 1 deletion dev/spark-test-image/python-ps-minimum/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For Pandas API
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE=20260127
ENV FULL_REFRESH_DATE=20260204

ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -44,6 +44,7 @@ RUN apt-get update && apt-get install -y \
openjdk-17-jdk-headless \
pkg-config \
python3.10 \
python3.10-venv \
python3-psutil \
tzdata \
software-properties-common \