Skip to content

Commit 485769a

Browse files
authored
Merge pull request #23 from yu9824/docs/update-docstrings
docs: Update docstrings
2 parents c361890 + afb462d commit 485769a

5 files changed

Lines changed: 127 additions & 194 deletions

File tree

src/kennard_stone/__init__.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,27 @@
1-
"""
2-
This is an algorithm for evenly partitioning data in a `scikit-learn`-like
3-
interface.
1+
"""kennard_stone package
2+
=========================
3+
4+
Utilities for splitting data as uniformly as possible using the
5+
Kennard–Stone algorithm. The package exposes a `scikit-learn`-compatible
6+
interface with ``KFold`` for cross-validation and ``train_test_split`` for
7+
convenient train/test partitioning.
8+
9+
Features
10+
--------
11+
- **KFold**: Kennard–Stone based K-fold cross-validator (non-stratified).
12+
- **train_test_split**: Kennard–Stone based train/test splitting utility.
13+
14+
Examples
15+
--------
16+
>>> from kennard_stone import KFold, train_test_split
17+
>>> X_train, X_test = train_test_split(X, test_size=0.2)
18+
>>> for train_idx, test_idx in KFold(n_splits=5).split(X):
19+
... pass
20+
21+
Notes
22+
-----
23+
Docstrings follow the NumPy style (parsed via napoleon) for Sphinx
24+
autodocumentation.
425
526
Copyright © 2021 yu9824
627
"""
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Core implementations for Kennard–Stone based splitting.
2+
3+
This module exposes the core implementations for data splitting based on the
4+
Kennard–Stone algorithm. Public entry points are ``KFold`` and
5+
``train_test_split``.
6+
7+
Notes
8+
-----
9+
Docstrings are written in the NumPy style for Sphinx + napoleon.
10+
"""

src/kennard_stone/_core/_core.py

Lines changed: 65 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import warnings
99
from array import array
1010
from itertools import chain
11-
from typing import Any, Optional, TypeVar, Union, overload
11+
from typing import Any, Optional, TypeVar, Union
1212

1313
if sys.version_info >= (3, 9):
1414
from collections.abc import Callable, Generator
@@ -55,60 +55,31 @@ def __init__(
5555
shuffle: None = None,
5656
random_state: None = None,
5757
) -> None:
58-
"""K-Folds cross-validator using the Kennard-Stone algorithm.
58+
"""Kennard–Stone based K-Fold cross-validator.
5959
6060
Parameters
6161
----------
62-
n_splits : int, optional
63-
Number of folds. Must be at least 2., by default 5
62+
n_splits : int, default=5
63+
Number of folds. Must be at least 2.
6464
65-
metric : Union[Metrics, Callable[[ArrayLike, ArrayLike], np.ndarray]
66-
, optional
65+
metric : Metrics or callable, default="euclidean"
66+
Distance metric. Either a metric name accepted by
67+
``sklearn.metrics.pairwise_distances`` /
68+
``scipy.spatial.distance.pdist`` or a callable returning an
69+
``ndarray``. On GPU, 'euclidean', 'manhattan', 'chebyshev' and
70+
'minkowski' distances are computed with ``torch.cdist``.
6771
68-
The distance metric to use. See the documentation of
69-
- `scipy.spatial.distance.pdist`
70-
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
71-
- `sklearn.metrics.pairwise_distances`
72-
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
72+
n_jobs : int, default=None
73+
Number of parallel jobs (CPU only).
7374
74-
for valid values.
75-
, by default "euclidean"
76-
77-
Valid values for metric are:
78-
79-
- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
80-
'l2', 'manhattan']. These metrics support sparse matrix inputs.
81-
['nan_euclidean'] but it does not yet support sparse matrices.
82-
- From scipy.spatial.distance: ['braycurtis', 'canberra',
83-
'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
84-
'mahalanobis', 'minkowski', 'rogerstanimoto',
85-
'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
86-
'sqeuclidean', 'yule'] See the documentation for
87-
scipy.spatial.distance for details on these metrics.
88-
These metrics do not support sparse matrix inputs.
89-
90-
If you want to use GPU when calculating the distance matrix
91-
('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
92-
you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
93-
94-
n_jobs : int, optional
95-
The number of parallel jobs. It is valid only when CPU is used.
96-
, by default None
97-
98-
device : Literal['cpu', 'cuda', 'mps'] or torch.device or str, optional
99-
, by default 'cpu'
100-
101-
If you want to use GPU when calculating the distance matrix
102-
('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
103-
you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
75+
device : {"cpu", "cuda", "mps"} or torch.device or str, default="cpu"
76+
Device for distance matrix computation.
10477
10578
random_state : None, deprecated
106-
This parameter is deprecated and has no effect
107-
because the algorithm is deterministic.
79+
No effect (algorithm is deterministic).
10880
10981
shuffle : None, deprecated
110-
This parameter is deprecated and has no effect
111-
because the algorithm is deterministic.
82+
No effect (algorithm is deterministic).
11283
"""
11384
super().__init__(n_splits=n_splits, shuffle=False, random_state=None)
11485
self.metric = metric
@@ -160,6 +131,21 @@ def __init__(
160131
n_jobs: Optional[int] = None,
161132
device: Device = "cpu",
162133
):
134+
"""Splitting helper for train/test based on Kennard–Stone.
135+
136+
Parameters
137+
----------
138+
n_splits : int, default=1
139+
Must be 1 for this class.
140+
141+
test_size : float or int, default=None
142+
Same semantics as ``sklearn.model_selection.train_test_split``.
143+
144+
train_size : float or int, default=None
145+
Same semantics as ``sklearn.model_selection.train_test_split``.
146+
147+
metric, n_jobs, device : see also ``KFold``
148+
"""
163149
super().__init__(
164150
n_splits=n_splits, test_size=test_size, train_size=train_size
165151
)
@@ -174,6 +160,7 @@ def __init__(
174160
def _iter_indices(
175161
self, X, y=None, groups=None
176162
) -> Generator[tuple[list[int], list[int]], None, None]:
163+
"""Internal generator. Yields train/test indices."""
177164
ks = _KennardStone(
178165
n_groups=1,
179166
scale=True,
@@ -209,87 +196,39 @@ def train_test_split(
209196
random_state: None = None,
210197
shuffle: None = None,
211198
) -> list[T]:
212-
"""Split arrays or matrices into train and test subsets using the
213-
Kennard-Stone algorithm.
199+
"""Split arrays or matrices into train and test subsets via Kennard–Stone.
214200
215-
Data partitioning by the Kennard-Stone algorithm is performed based on the
216-
first element to be input.
201+
The first input array determines the geometric order of indices so that the
202+
split is as uniform as possible. All subsequent arrays are split using the
203+
same indices.
217204
218205
Parameters
219206
----------
220-
*arrays: sequence of indexables with same length / shape[0]
221-
Allowed inputs are lists, numpy arrays, scipy-sparse
222-
matrices or pandas dataframes.
223-
224-
test_size : float or int, optional
225-
If float, should be between 0.0 and 1.0 and represent the proportion
226-
of the dataset to include in the test split. If int, represents the
227-
absolute number of test samples. If None, the value is set to the
228-
complement of the train size. If train_size is also None, it will be
229-
set to 0.25., by default None
230-
231-
train_size : float or int, optional
232-
If float, should be between 0.0 and 1.0 and represent the proportion
233-
of the dataset to include in the train split. If int, represents the
234-
absolute number of train samples. If None, the value is automatically
235-
set to the complement of the test size., by default None
236-
237-
metric : Union[Metrics, Callable[[ArrayLike, ArrayLike], np.ndarray]]
238-
, optional
239-
240-
The distance metric to use. See the documentation of
241-
- `scipy.spatial.distance.pdist`
242-
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
243-
- `sklearn.metrics.pairwise_distances`
244-
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
245-
246-
for valid values.
247-
, by default "euclidean"
248-
249-
Valid values for metric are:
250-
251-
- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
252-
'l2', 'manhattan']. These metrics support sparse matrix inputs.
253-
['nan_euclidean'] but it does not yet support sparse matrices.
254-
- From scipy.spatial.distance: ['braycurtis', 'canberra',
255-
'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
256-
'mahalanobis', 'minkowski', 'rogerstanimoto',
257-
'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
258-
'sqeuclidean', 'yule'] See the documentation for
259-
scipy.spatial.distance for details on these metrics.
260-
These metrics do not support sparse matrix inputs.
261-
262-
If you want to use GPU when calculating the distance matrix
263-
('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
264-
you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
265-
266-
n_jobs : int, optional
267-
The number of parallel jobs. It is valid only when CPU is used.
268-
, by default None
269-
270-
device : Literal['cpu', 'cuda', 'mps'] or torch.device or str, optional
271-
, by default 'cpu'
272-
273-
If you want to use GPU when calculating the distance matrix
274-
('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
275-
you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
276-
277-
random_state : None, deprecated
278-
This parameter is deprecated and has no effect
279-
because the algorithm is deterministic.
280-
281-
shuffle : None, deprecated
282-
This parameter is deprecated and has no effect
283-
because the algorithm is deterministic.
207+
*arrays : sequence of indexables
208+
Arrays of equal length (list, ndarray, scipy-sparse, pandas DataFrame, etc.).
209+
210+
test_size : float or int, default=None
211+
Proportion in [0.0, 1.0] or absolute count. If ``None``, it becomes the
212+
complement of ``train_size``. If both are ``None``, defaults to 0.25.
213+
214+
train_size : float or int, default=None
215+
Proportion or absolute count for the train split. If ``None``, becomes
216+
the complement of ``test_size``.
217+
218+
metric, n_jobs, device : see also ``KFold``
219+
220+
random_state, shuffle : None, deprecated
221+
No effect (algorithm is deterministic).
284222
285223
Returns
286224
-------
287-
splitting : list, length=2 * len(arrays)
288-
List containing train-test split of inputs
225+
list
226+
A list like ``[X_train, X_test, y_train, y_test, ...]``.
289227
290228
Raises
291229
------
292230
ValueError
231+
If no input arrays are provided.
293232
"""
294233
if shuffle is not None:
295234
warnings.warn(
@@ -344,56 +283,17 @@ def __init__(
344283
n_jobs: Optional[int] = None,
345284
device: Device = "cpu",
346285
) -> None:
347-
"""The root program of the Kennard-Stone algorithm,
348-
an algorithm for evenly partitioning data.
286+
"""Internal class implementing the core of the Kennard–Stone algorithm.
349287
350288
Parameters
351289
----------
352-
n_groups : int, optional
353-
how many groups to divide, by default 1
354-
355-
scale : bool, optional
356-
scaling X or not, by default True
357-
358-
metric : Union[Metrics, Callable[[ArrayLike, ArrayLike], np.ndarray]]
359-
, optional
360-
361-
The distance metric to use. See the documentation of
362-
- `scipy.spatial.distance.pdist`
363-
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
364-
- `sklearn.metrics.pairwise_distances`
365-
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
366-
367-
for valid values.
368-
, by default "euclidean"
369-
370-
Valid values for metric are:
371-
372-
- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
373-
'l2', 'manhattan']. These metrics support sparse matrix inputs.
374-
['nan_euclidean'] but it does not yet support sparse matrices.
375-
- From scipy.spatial.distance: ['braycurtis', 'canberra',
376-
'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
377-
'mahalanobis', 'minkowski', 'rogerstanimoto',
378-
'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
379-
'sqeuclidean', 'yule'] See the documentation for
380-
scipy.spatial.distance for details on these metrics.
381-
These metrics do not support sparse matrix inputs.
382-
383-
If you want to use GPU when calculating the distance matrix
384-
('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
385-
you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
386-
387-
n_jobs : int, optional
388-
The number of parallel jobs. It is valid only when CPU is used.
389-
, by default None
390-
391-
device : Literal['cpu', 'cuda', 'mps'] or torch.device or str, optional
392-
, by default 'cpu'
393-
394-
If you want to use GPU when calculating the distance matrix
395-
('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
396-
you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
290+
n_groups : int, default=1
291+
Number of groups to split into.
292+
293+
scale : bool, default=True
294+
Whether to standardize features before computing distances.
295+
296+
metric, n_jobs, device : see also ``KFold``
397297
"""
398298
self.n_groups = n_groups
399299
self.scale = scale
@@ -402,17 +302,17 @@ def __init__(
402302
self.device = device
403303

404304
def get_indexes(self, X: ArrayLike) -> list[array[int]]:
405-
"""Sort indexes by the Kennard-Stone algorithm.
305+
"""Compute index sequences using the Kennard–Stone procedure.
406306
407307
Parameters
408308
----------
409309
X : ArrayLike
410-
The data to be sorted.
310+
2D array of shape (n_samples, n_features).
411311
412312
Returns
413313
-------
414314
list[array[int]]
415-
The sorted indexes.
315+
A list of index arrays corresponding to each group.
416316
"""
417317
# check input array
418318
# scikit-learn 1.6+ deprecates 'force_all_finite' and 1.8 renames to

0 commit comments

Comments
 (0)