Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9506a8f
draft structure of DataDriftAnalyzer
alelavml3 Jun 24, 2025
d1e5ef0
first ks implementation
alelavml3 Jun 24, 2025
028c872
ks after test
alelavml3 Jun 24, 2025
a026027
just test rule accept testWorkers parameter
alelavml3 Jun 24, 2025
3d5106e
new drift info
alelavml3 Jun 24, 2025
ea30f43
testing for analyzer
alelavml3 Jun 25, 2025
70cb647
formatting with bonferroni, new monitoring specs
alelavml3 Jun 25, 2025
8023a91
abstract class for data batch analyzer
alelavml3 Jun 25, 2025
945127b
detection is performed in online or offline according to the monitori…
alelavml3 Jun 25, 2025
27c4733
fix test import but still wrong because it is streaming
alelavml3 Jun 25, 2025
5fca60f
batch drift analyzer
alelavml3 Jun 25, 2025
df80ef2
doc strings
alelavml3 Jun 25, 2025
768b681
test support with polars and pandas
alelavml3 Jun 25, 2025
549a515
Handle new extras in tests; linting according to previous py versions
GiovanniGiacometti Jun 26, 2025
7d1e5db
Parametrize for tests done in a loop
GiovanniGiacometti Jun 26, 2025
fce49d1
Refactor scan method and tests of batch-analyzer
GiovanniGiacometti Jun 30, 2025
fddd086
Improvements to Monitoring Algorithm base class
GiovanniGiacometti Jun 30, 2025
72dd083
Default algorithms builders
GiovanniGiacometti Jun 30, 2025
0b84b97
Refactor classes to accept an instance of algorithms rather than buil…
GiovanniGiacometti Jun 30, 2025
765eb4f
HuggingFace integration uses monitoring modules
GiovanniGiacometti Jun 30, 2025
1eaad45
wip in sklearn general detector
alelavml3 Jul 15, 2025
25869a0
fix tests
alelavml3 Jul 15, 2025
7cbaf46
sklearn detector uses standard monitoring algorithms
alelavml3 Jul 16, 2025
c69b39d
better comment
alelavml3 Jul 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/validation/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ runs:

- name: 🦾 💅 🧪 Install and validate extras
run: |
extras=("sklearn" "huggingface")
# Bash array elements are separated by whitespace only; a comma would become
# part of the element value (e.g. "huggingface,") and break the extra lookup.
extras=("sklearn" "huggingface" "polars" "pandas")

for extra in "${extras[@]}"; do
echo "🦾 Installing extra: $extra"
Expand Down
39 changes: 35 additions & 4 deletions Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,34 @@ set quiet
default:
just --list --unsorted

# --------------------------------------------------
# Developer Setup

# Synchronize the environment by installing all the dependencies
dev-sync:
uv sync --cache-dir .uv_cache --all-extras

# Synchronize the environment by installing the specified extra dependency
# Currently used within the CI to install extra dependencies and test them.
dev-sync-extra extra:
uv sync --cache-dir .uv_cache --extra {{extra}}

# Synchronize the environment by installing all the dependencies except the dev ones
prod-sync:
uv sync --cache-dir .uv_cache --all-extras --no-dev
uv sync --cache-dir .uv_cache --all-extras --no-default-groups

# Synchronize the environment by installing the extra dependency
# specified. Doesn't install the dev dependencies.
prod-sync-extra extra:
uv sync --cache-dir .uv_cache --extra {{extra}} --no-default-groups

# Install the pre-commit hooks
install-hooks:
uv run pre-commit install

# --------------------------------------------------
# Validation

# Run ruff formatting
format:
uv run ruff format
Expand All @@ -31,9 +44,26 @@ lint:
uv run ruff check --fix
uv run mypy --ignore-missing-imports --install-types --non-interactive --package ml3_drift


# Default value for testWorkers is auto (meaning all workers available)
# If you want to pass a custom value (such as 4): `just testWorkers=4 test`
# We also run ruff on test files (it's so fast that it's worth it)

# Little caveat: when running tests with only an extra installed, you'd like
# to avoid having docs dependencies installed (since, for instance, a mkdocs plugin
# requires Pandas, which is one of our extra dependencies). This happens by default
# since docs dependencies are not installed as default dependencies by uv (see pyproject.toml).
# They are only installed when building / serving the documentation. However, if you first
# build the documentation, then run the tests, you will have the docs dependencies installed.
# Should not be a practical problem (especially since in CI environments we don't install docs dependencies),
# but it's worth noting.

# Run the tests with pytest
testWorkers := "auto"
test:
uv run pytest --verbose --color=yes -n auto --exitfirst tests
uv run ruff format tests
uv run ruff check tests --fix
uv run pytest --verbose --color=yes -n {{testWorkers}} --exitfirst tests

# Run linters, formatters and tests
validate: format lint test
Expand All @@ -43,11 +73,12 @@ validate: format lint test

# Generate the documentation
build-docs:
uv run mkdocs build
# Make sure mkdocs is installed
uv run --group docs mkdocs build

# Serve the documentation locally
serve-docs:
uv run mkdocs serve
uv run --group docs mkdocs serve

# --------------------------------------------------
# Publishing
Expand Down
41 changes: 31 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,52 +7,73 @@ dynamic = ["version"]
license = { text = "Apache-2.0" }
readme = "README.md"

dependencies = []
dependencies = [
"scipy>=1.15.3",
]

# -------------------------------------------------
# Extra dependencies. This package is designed to be
# used within one extra at a time, hence we check each
# extra separately. Remember to update the list of extras
# in the validation action to ensure tests are run
# used with different libraries, which means that our code
# should work even when not all extras are installed.
# Remember to update the list of extras
# in the validation CICD to ensure tests are run
# for your new extra
[project.optional-dependencies]

sklearn = ["scikit-learn>=1.6.1"]

huggingface = ["scipy>=1.15.2", "transformers[torch]>=4.52.3"]

polars = ["polars>=1.31.0"]

pandas = ["pandas>=2.2.3"]


# -------------------------------------------------

[dependency-groups]
dev = [
"ipykernel>=6.29.5",
"mypy>=1.15.0",
"pillow>=11.2.1", # for image support in tests
"pre-commit>=4.1.0",
"pytest>=8.3.4",
"pytest-xdist>=3.6.1",
"ruff>=0.9.5",
# for docs
# for image support in tests
"pillow>=11.2.1",
]

docs = [
"mkdocs-minify-plugin>=0.7.1",
"mkdocs-glightbox>=0.3.4",
"mkdocs-table-reader-plugin>=2.0.1",
"mkdocs-macros-plugin",
"mkdocs>=1.5.0",
"mkdocs-material>=9.5.0",
"mkdocs-material-extensions>=1.1",
"pygments>=2.14",
"pymdown-extensions>=9.9.1",
"jinja2>=3.0",
"markdown>=3.2",
"mkdocs-minify-plugin>=0.7.1",
"mkdocs-glightbox>=0.3.4",
"mkdocs-table-reader-plugin>=2.0.1",
"mkdocs-macros-plugin",
"openpyxl",
]

# -------------------------------------------------

# Default groups for uv
[tool.uv]
default-groups = ["dev"]

# -------------------------------------------------

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.version]
path = "src/ml3_drift/__init__.py"


# Set pytest folder
[tool.pytest.ini_options]
testpaths = ["tests"]
3 changes: 0 additions & 3 deletions ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ exclude = [
line-length = 88
indent-width = 4

# Assume Python 3.9
target-version = "py39"

[lint]
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
Expand Down
Empty file.
Empty file.
163 changes: 163 additions & 0 deletions src/ml3_drift/analysis/analyzer/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from abc import ABC, abstractmethod
import numpy as np
from typing import TYPE_CHECKING, Callable, Union
from typing_extensions import TypeIs

from ml3_drift.analysis.report import Report
from ml3_drift.monitoring.base import MonitoringAlgorithm

if TYPE_CHECKING:
import pandas as pd
import polars as pl

POLARS = True
try:
import polars as pl
except ModuleNotFoundError:
POLARS = False


PANDAS = True
try:
import pandas as pd
except ModuleNotFoundError:
PANDAS = False
Comment thread
GiovanniGiacometti marked this conversation as resolved.


class DataDriftAnalyzer(ABC):
    """
    Analyze a dataset identifying the sequence of distributions due to data drifts.

    This is an abstract base class: `analyze` validates the inputs, converts
    them to a canonical form (numpy arrays and integer column indexes) and
    delegates the actual detection to `_scan_data`, which subclasses implement.

    Parameters
    ----------
    continuous_ma_builder: closure function that accepts int parameter as `comparison_window_size`
        and returns an instance of a MonitoringAlgorithm
    categorical_ma_builder: closure function that accepts int parameter as `comparison_window_size`
        and returns an instance of a MonitoringAlgorithm
    """

    def __init__(
        self,
        continuous_ma_builder: Callable[[int], MonitoringAlgorithm],
        categorical_ma_builder: Callable[[int], MonitoringAlgorithm],
    ):
        self.continuous_ma_builder = continuous_ma_builder
        self.categorical_ma_builder = categorical_ma_builder

    def _is_list_str(self, columns: list[str] | list[int]) -> TypeIs[list[str]]:
        """Return True when every element of `columns` is a str.

        Note that an empty list is vacuously a list of str.
        """

        return all(isinstance(elem, str) for elem in columns)

    def _to_index(
        self,
        X: Union[np.ndarray, "pd.DataFrame", "pl.DataFrame"],
        columns: list[str] | list[int] | None,
    ) -> list[int]:
        """Translate the list of columns in list of indices.

        If columns is None then the indexes of all the columns of X are returned.
        If columns is empty then an empty list is returned.
        If columns is list[int] then it is directly returned.
        If columns is list[str] then the indexes are retrieved from column names,
        in this case X must be a DataFrame. Indexes follow the column order of X
        and names that are not columns of X are ignored.

        Raises
        ------
        ValueError: if columns contains names and X is not a DataFrame.
        """

        if columns is None:
            # Axis 0 holds the rows (samples); the columns are on axis 1.
            return list(range(X.shape[1]))

        if not columns:
            # An empty selection is valid for any input type. Without this
            # guard it would be classified as a list of names and rejected
            # for numpy input.
            return []

        if self._is_list_str(columns):
            if POLARS and isinstance(X, pl.DataFrame):
                names = list(X.columns)
            elif PANDAS and isinstance(X, pd.DataFrame):
                names = list(X.columns)
            else:
                raise ValueError(
                    f"Type not valid, expecting polars DataFrame or pandas DataFrame when columns has string values. Got {type(X)}"
                )
            # NOTE(review): requested names missing from X are silently
            # skipped; confirm whether this should raise instead.
            wanted = set(columns)
            return [i for (i, name) in enumerate(names) if name in wanted]

        return columns

    def _to_numpy(
        self, X: Union[np.ndarray, "pd.DataFrame", "pl.DataFrame"]
    ) -> np.ndarray:
        """Transform input data into numpy array.

        Raises
        ------
        ValueError: if X is not a numpy array, a polars DataFrame or a pandas DataFrame.
        """

        if POLARS and isinstance(X, pl.DataFrame):
            return X.to_numpy()
        elif PANDAS and isinstance(X, pd.DataFrame):
            return X.to_numpy()
        elif isinstance(X, np.ndarray):
            return X
        else:
            raise ValueError(
                f"Type not valid, expecting numpy array, polars DataFrame or pandas DataFrame. Got {type(X)}"
            )

    @abstractmethod
    def _scan_data(
        self,
        X: np.ndarray,
        y: np.ndarray | None,
        continuous_columns_ids: list[int],
        categorical_columns_ids: list[int],
        y_categorical: bool,
    ) -> Report:
        """Scan the data to identify different data partitions according to monitoring algorithm."""

    def analyze(
        self,
        X: Union[np.ndarray, "pd.DataFrame", "pl.DataFrame"],
        y: Union[None, np.ndarray, "pd.DataFrame", "pl.DataFrame"],
        continuous_columns: list[str] | list[int] | None,
        categorical_columns: list[str] | list[int] | None,
        y_categorical: bool,
    ) -> Report:
        """Analyze the data to split them into different distribution according to drift detectors.

        If target is provided then concept drift is used as split criterion, otherwise, it uses input drift.

        Parameters
        ----------
        X: input data. Can be numpy array, pandas dataframe or polars dataframe
        y: target data. It is optional and can be numpy array, pandas dataframe or polars dataframe
        continuous_columns: if not None it is the indices or names of the columns that are continuous
        categorical_columns: if not None it is the indices or names of the columns that are categorical
        y_categorical: if True, then the target is categorical, otherwise it is considered as continuous

        Output
        ------
        Report object containing information about identified data groups

        Raises
        ------
        ValueError: if y is provided with a number of rows different from X, or
            if X / y have an unsupported type.
        """
        # Shape check
        if y is not None and X.shape[0] != y.shape[0]:
            raise ValueError(
                f"When target y is not None it must have the same rows of input X. Got X: {X.shape} and y: {y.shape}"
            )

        # Continuous and categorical columns to canonical form.
        # Here None means "no column of this kind", hence the empty list.
        continuous_columns_ids = (
            self._to_index(X, continuous_columns)
            if continuous_columns is not None
            else []
        )
        categorical_columns_ids = (
            self._to_index(X, categorical_columns)
            if categorical_columns is not None
            else []
        )

        # Input and target in canonical form
        array_X = self._to_numpy(X)
        array_y = self._to_numpy(y) if y is not None else None

        # Data analysis
        return self._scan_data(
            array_X,
            array_y,
            continuous_columns_ids,
            categorical_columns_ids,
            y_categorical,
        )
Loading