ml-cube · alelavml3 · Jul 16, 2025 · Jun 24, 2025 · Jun 24, 2025 · Jun 24, 2025
diff --git a/.github/actions/validation/action.yml b/.github/actions/validation/action.yml
@@ -17,7 +17,7 @@ runs:
 
     - name: 🦾 💅 🧪 Install and validate extras
       run: |
-        extras=("sklearn" "huggingface")
+        # Bash array elements are separated by whitespace only: a comma would
+        # become part of the element (e.g. "huggingface,") and break the install.
+        extras=("sklearn" "huggingface" "polars" "pandas")
 
         for extra in "${extras[@]}"; do
           echo "🦾 Installing extra: $extra"

diff --git a/Justfile b/Justfile
@@ -7,21 +7,34 @@ set quiet
 default:
   just --list --unsorted
 
+# --------------------------------------------------
+# Developer Setup
+
 # Synchronize the environment by installing all the dependencies
 dev-sync:
     uv sync --cache-dir .uv_cache --all-extras
 
+# Synchronize the environment by installing the specified extra dependency
+# Currently used within the CI to install extra dependencies and test them.
 dev-sync-extra extra:
 	uv sync --cache-dir .uv_cache --extra {{extra}}
 
 # Synchronize the environment by installing all the dependencies except the dev ones
 prod-sync:
-	uv sync --cache-dir .uv_cache --all-extras --no-dev
+	uv sync --cache-dir .uv_cache --all-extras --no-default-groups
+
+# Synchronize the environment by installing the extra dependency
+# specified. Doesn't install the dev dependencies.
+prod-sync-extra extra:
+	uv sync --cache-dir .uv_cache --extra {{extra}} --no-default-groups
 
 # Install the pre-commit hooks
 install-hooks:
 	uv run pre-commit install
 
+# --------------------------------------------------
+# Validation
+
 # Run ruff formatting
 format:
 	uv run ruff format
@@ -31,9 +44,26 @@ lint:
 	uv run ruff check --fix
 	uv run mypy --ignore-missing-imports --install-types --non-interactive --package ml3_drift
 
+
+# Default value for testWorkers is auto (meaning all workers available)
+# If you want to pass a custom value (such as 4): `just testWorkers=4 test`
+# We also run ruff on the test files (it's so fast that it's worth it)
+
+# Little caveat: when running tests with only an extra installed, you'd like
+# to avoid having docs dependencies installed (since, for instance, a mkdocs plugin
+# requires Pandas, which is one of our extra dependencies). This happens by default
+# since docs dependencies are not installed as default dependencies by uv (see pyproject.toml).
+# They are only installed when building / serving the documentation. However, if you first
+# build the documentation, then run the tests, you will have the docs dependencies installed.
+# Should not be a practical problem (especially since in CI environments we don't install docs dependencies),
+# but it's worth noting.
+
+# Number of pytest-xdist workers ("auto" uses all the available ones).
+# Declared before the recipe comment so that the comment stays directly
+# above `test:` and is still shown as its description by `just --list`.
+testWorkers := "auto"
+
 # Run the tests with pytest
 test:
-	uv run pytest --verbose --color=yes -n auto --exitfirst tests
+    uv run ruff format tests
+    uv run ruff check tests --fix
+    uv run pytest --verbose --color=yes -n {{testWorkers}} --exitfirst tests
 
 # Run linters, formatters and tests
 validate: format lint test
@@ -43,11 +73,12 @@ validate: format lint test
 
 # Generate the documentation
 build-docs:
-	uv run mkdocs build
+    # Make sure mkdocs is installed
+    uv run --group docs mkdocs build
 
 # Serve the documentation locally
 serve-docs:
-	uv run mkdocs serve
+    uv run --group docs mkdocs serve
 
 # --------------------------------------------------
 # Publishing

diff --git a/examples/huggingface/text_embedding_monitoring.py b/examples/huggingface/text_embedding_monitoring.py
@@ -3,7 +3,8 @@
 from ml3_drift.huggingface.drift_detection_pipeline import (
     HuggingFaceDriftDetectionPipeline,
 )
-from ml3_drift.huggingface.univariate.ks import KSDriftDetector
+from ml3_drift.monitoring.multivariate.bonferroni import BonferroniCorrectionAlgorithm
+from ml3_drift.monitoring.univariate.continuous.ks import KSAlgorithm
 from ml3_drift.callbacks.base import logger_callback
 
 
@@ -37,7 +38,8 @@
     # to monitor the drift in the embeddings.
 
     hf_pipe = HuggingFaceDriftDetectionPipeline(
-        drift_detector=KSDriftDetector(
+        drift_detector=BonferroniCorrectionAlgorithm(
+            algorithm=KSAlgorithm(p_value=0.05),
             callbacks=[
                 partial(
                     logger_callback,

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,52 +7,73 @@ dynamic = ["version"]
 license = { text = "Apache-2.0" }
 readme = "README.md"
 
-dependencies = []
+dependencies = [
+    "scipy>=1.15.3",
+]
 
 # -------------------------------------------------
 # Extra dependencies. This package is designed to be
-# used within one extra at a time, hence we check each
-# extra separately. Remember to update the list of extras
-# in the validation action to ensure tests are run
+# used with different libraries, which means that our code
+# should work even when not all extras are installed.
+# Remember to update the list of extras
+# in the validation CICD to ensure tests are run
 # for your new extra
 [project.optional-dependencies]
 
 sklearn = ["scikit-learn>=1.6.1"]
 
 huggingface = ["scipy>=1.15.2", "transformers[torch]>=4.52.3"]
 
+polars = ["polars>=1.31.0"]
+
+pandas = ["pandas>=2.2.3"]
+
 
 # -------------------------------------------------
 
 [dependency-groups]
 dev = [
     "ipykernel>=6.29.5",
     "mypy>=1.15.0",
-    "pillow>=11.2.1",      # for image support in tests
     "pre-commit>=4.1.0",
     "pytest>=8.3.4",
     "pytest-xdist>=3.6.1",
     "ruff>=0.9.5",
-    # for docs
+    # for image support in tests
+    "pillow>=11.2.1",
+]
+
+docs = [
+    "mkdocs-minify-plugin>=0.7.1",
+    "mkdocs-glightbox>=0.3.4",
+    "mkdocs-table-reader-plugin>=2.0.1",
+    "mkdocs-macros-plugin",
     "mkdocs>=1.5.0",
     "mkdocs-material>=9.5.0",
     "mkdocs-material-extensions>=1.1",
     "pygments>=2.14",
     "pymdown-extensions>=9.9.1",
     "jinja2>=3.0",
     "markdown>=3.2",
-    "mkdocs-minify-plugin>=0.7.1",
-    "mkdocs-glightbox>=0.3.4",
-    "mkdocs-table-reader-plugin>=2.0.1",
-    "mkdocs-macros-plugin",
     "openpyxl",
 ]
 
 # -------------------------------------------------
 
+# Default groups for uv
+[tool.uv]
+default-groups = ["dev"]
+
+# -------------------------------------------------
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [tool.hatch.version]
 path = "src/ml3_drift/__init__.py"
+
+
+# Set pytest folder
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/ruff.toml b/ruff.toml
@@ -33,9 +33,6 @@ exclude = [
 line-length = 88
 indent-width = 4
 
-# Assume Python 3.9
-target-version = "py39"
-
 [lint]
 # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`)  codes by default.
 # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or

diff --git a/..._drift/huggingface/univariate/__init__.py → src/ml3_drift/analysis/__init__.py b/..._drift/huggingface/univariate/__init__.py → src/ml3_drift/analysis/__init__.py
diff --git a/src/ml3_drift/sklearn/univariate/__init__.py → src/ml3_drift/analysis/analyzer/__init__.py b/src/ml3_drift/sklearn/univariate/__init__.py → src/ml3_drift/analysis/analyzer/__init__.py
diff --git a/src/ml3_drift/analysis/analyzer/base.py b/src/ml3_drift/analysis/analyzer/base.py
@@ -0,0 +1,178 @@
+from abc import ABC, abstractmethod
+import numpy as np
+from typing import TYPE_CHECKING, Union
+from typing_extensions import TypeIs
+
+from ml3_drift.analysis.report import Report
+from ml3_drift.monitoring.base import MonitoringAlgorithm
+from ml3_drift.monitoring.multivariate.bonferroni import BonferroniCorrectionAlgorithm
+from ml3_drift.monitoring.univariate.continuous.ks import KSAlgorithm
+from ml3_drift.monitoring.univariate.discrete.chi_square import (
+    ChiSquareAlgorithm,
+)
+
+if TYPE_CHECKING:
+    import pandas as pd
+    import polars as pl
+
+POLARS = True
+try:
+    import polars as pl
+except ModuleNotFoundError:
+    POLARS = False
+
+
+PANDAS = True
+try:
+    import pandas as pd
+except ModuleNotFoundError:
+    PANDAS = False
+
+
+class DataDriftAnalyzer(ABC):
+    """
+    Analyze a dataset identifying the sequence of distributions due to data drifts.
+
+    Parameters
+    ----------
+    continuous_monitoring_algorithm: MonitoringAlgorithm | None
+        Algorithm used to monitor continuous data. If None, a default algorithm is used.
+    categorical_monitoring_algorithm: MonitoringAlgorithm | None
+        Algorithm used to monitor categorical data. If None, a default algorithm is used.
+    """
+
+    def __init__(
+        self,
+        continuous_monitoring_algorithm: MonitoringAlgorithm | None = None,
+        categorical_monitoring_algorithm: MonitoringAlgorithm | None = None,
+    ):
+        # We use default algorithms if None is provided.
+        if continuous_monitoring_algorithm is None:
+            continuous_monitoring_algorithm = BonferroniCorrectionAlgorithm(
+                algorithm=KSAlgorithm(),
+            )
+        if categorical_monitoring_algorithm is None:
+            categorical_monitoring_algorithm = BonferroniCorrectionAlgorithm(
+                algorithm=ChiSquareAlgorithm(),
+            )
+
+        self.continuous_monitoring_algorithm = continuous_monitoring_algorithm
+        self.categorical_monitoring_algorithm = categorical_monitoring_algorithm
+
+    def _is_list_str(self, columns: list[str] | list[int]) -> TypeIs[list[str]]:
+        """Verify if the input variable is a list of str in any element"""
+
+        return all(isinstance(elem, str) for elem in columns)
+
+    def _to_index(
+        self,
+        X: Union[np.ndarray, "pd.DataFrame", "pl.DataFrame"],
+        columns: list[str] | list[int] | None,
+    ) -> list[int]:
+        """Translate the list of columns in list of indices.
+
+        If columns is None then all the indexes are returned.
+        If columns is list[int] then it is directly returned.
+        If columns is list[str] then the indexes are retrieved from column names,
+        in this case X must be a DataFrame."""
+
+        if columns is None:
+            return list(range(X.shape[0]))
+
+        if self._is_list_str(columns):
+            if POLARS and isinstance(X, pl.DataFrame):
+                return [i for (i, c) in enumerate(X.columns) if c in columns]
+            elif PANDAS and isinstance(X, pd.DataFrame):
+                return [i for (i, c) in enumerate(X.columns) if c in columns]
+            else:
+                raise ValueError(
+                    f"Type not valid, expecting polars DataFrame or pandas DataFrame when columns has string values. Got {type(X)}"
+                )
+        return columns
+
+    def _to_numpy(
+        self, X: Union[np.ndarray, "pd.DataFrame", "pl.DataFrame"]
+    ) -> np.ndarray:
+        """Transform input data into numpy array"""
+
+        if POLARS and isinstance(X, pl.DataFrame):
+            return X.to_numpy()
+        elif PANDAS and isinstance(X, pd.DataFrame):
+            return X.to_numpy()
+        elif isinstance(X, np.ndarray):
+            return X
+        else:
+            raise ValueError(
+                f"Type not valid, expecting numpy array, polars DataFrame or pandas DataFrame. Got {type(X)}"
+            )
+
+    @abstractmethod
+    def _scan_data(
+        self,
+        X: np.ndarray,
+        y: np.ndarray | None,
+        continuous_columns_ids: list[int],
+        categorical_columns_ids: list[int],
+        y_categorical: bool,
+    ) -> Report:
+        """Scan the data to identify different data partitions according to monitoring algorithm."""
+
+    def analyze(
+        self,
+        X: Union[np.ndarray, "pd.DataFrame", "pl.DataFrame"],
+        y: Union[None, np.ndarray, "pd.DataFrame", "pl.DataFrame"],
+        continuous_columns: list[str] | list[int] | None,
+        categorical_columns: list[str] | list[int] | None,
+        y_categorical: bool,
+    ) -> Report:
+        """Analyze the data to split them into different distribution according to drift detectors.
+
+        If target is provided then concept drift is used as split criterion, otherwise, it uses input drift.
+
+        Parameters
+        ----------
+        X: input data. Can be numpy array, pandas dataframe or polars dataframe
+        y: target data. It is optional and can be numpy array, pandas dataframe or polars dataframe
+        continuous_columns: if not None it is the indices or names of the columns that are continuous
+        categorical_columns: if not None it is the indices or names of the columns that are categorical
+        y_categorical: if True, then the target is categorical, otherwise it is considered as continuous
+
+        Output
+        ------
+        Report object containing information about identified data groups
+        """
+        # Shape check
+        if y is not None and X.shape[0] != y.shape[0]:
+            raise ValueError(
+                f"When target y is not None it must have the same rows of input X. Got X: {X.shape} and y: {y.shape}"
+            )
+
+        # Continuous and categorical columns to canonical form
+        if continuous_columns is not None:
+            continuous_columns_ids = self._to_index(X, continuous_columns)
+        else:
+            continuous_columns_ids = []
+
+        if categorical_columns is not None:
+            categorical_columns_ids = self._to_index(X, categorical_columns)
+        else:
+            categorical_columns_ids = []
+
+        # Input and target in canonical form
+        array_X = self._to_numpy(X)
+
+        if y is not None:
+            array_y = self._to_numpy(y)
+        else:
+            array_y = None
+
+        # Data analysis
+        report = self._scan_data(
+            array_X,
+            array_y,
+            continuous_columns_ids,
+            categorical_columns_ids,
+            y_categorical,
+        )
+
+        return report