88import warnings
99from array import array
1010from itertools import chain
11- from typing import Any , Optional , TypeVar , Union , overload
11+ from typing import Any , Optional , TypeVar , Union
1212
1313if sys .version_info >= (3 , 9 ):
1414 from collections .abc import Callable , Generator
@@ -55,60 +55,31 @@ def __init__(
5555 shuffle : None = None ,
5656 random_state : None = None ,
5757 ) -> None :
58- """K-Folds cross-validator using the Kennard-Stone algorithm.
58+ """Kennard–Stone based K-Fold cross-validator.
5959
6060 Parameters
6161 ----------
62- n_splits : int, optional
63- Number of folds. Must be at least 2., by default 5
62+ n_splits : int, default=5
63+ Number of folds. Must be at least 2.
6464
65- metric : Union[Metrics, Callable[[ArrayLike, ArrayLike], np.ndarray]
66- , optional
65+ metric : {Metrics, callable}, default="euclidean"
66+ Distance metric. Either a metric name accepted by
67+ ``sklearn.metrics.pairwise_distances`` /
68+ ``scipy.spatial.distance.pdist`` or a callable returning an
69+ ``ndarray``. On a GPU device, ``torch.cdist`` is used; this applies to
70+ the 'euclidean', 'manhattan', 'chebyshev' and 'minkowski' metrics.
6771
68- The distance metric to use. See the documentation of
69- - `scipy.spatial.distance.pdist`
70- https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
71- - `sklearn.metrics.pairwise_distances`
72- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
72+ n_jobs : int, default=None
73+ Number of parallel jobs (CPU only).
7374
74- for valid values.
75- , by default "euclidean"
76-
77- Valid values for metric are:
78-
79- - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
80- 'l2', 'manhattan']. These metrics support sparse matrix inputs.
81- ['nan_euclidean'] but it does not yet support sparse matrices.
82- - From scipy.spatial.distance: ['braycurtis', 'canberra',
83- 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
84- 'mahalanobis', 'minkowski', 'rogerstanimoto',
85- 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
86- 'sqeuclidean', 'yule'] See the documentation for
87- scipy.spatial.distance for details on these metrics.
88- These metrics do not support sparse matrix inputs.
89-
90- If you want to use GPU when calculating the distance matrix
91- ('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
92- you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
93-
94- n_jobs : int, optional
95- The number of parallel jobs. It is valid only when CPU is used.
96- , by default None
97-
98- device : Literal['cpu', 'cuda', 'mps'] or torch.device or str, optional
99- , by default 'cpu'
100-
101- If you want to use GPU when calculating the distance matrix
102- ('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
103- you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
75+ device : {"cpu", "cuda", "mps"} or torch.device or str, default="cpu"
76+ Device for distance matrix computation.
10477
10578 random_state : None, deprecated
106- This parameter is deprecated and has no effect
107- because the algorithm is deterministic.
79+ No effect (algorithm is deterministic).
10880
10981 shuffle : None, deprecated
110- This parameter is deprecated and has no effect
111- because the algorithm is deterministic.
82+ No effect (algorithm is deterministic).
11283 """
11384 super ().__init__ (n_splits = n_splits , shuffle = False , random_state = None )
11485 self .metric = metric
@@ -160,6 +131,21 @@ def __init__(
160131 n_jobs : Optional [int ] = None ,
161132 device : Device = "cpu" ,
162133 ):
134+ """Splitting helper for train/test based on Kennard–Stone.
135+
136+ Parameters
137+ ----------
138+ n_splits : int, default=1
139+ Must be 1 for this class.
140+
141+ test_size : float or int, default=None
142+ Same semantics as ``sklearn.model_selection.train_test_split``.
143+
144+ train_size : float or int, default=None
145+ Same semantics as ``sklearn.model_selection.train_test_split``.
146+
147+ metric, n_jobs, device : see also ``KFold``
148+ """
163149 super ().__init__ (
164150 n_splits = n_splits , test_size = test_size , train_size = train_size
165151 )
@@ -174,6 +160,7 @@ def __init__(
174160 def _iter_indices (
175161 self , X , y = None , groups = None
176162 ) -> Generator [tuple [list [int ], list [int ]], None , None ]:
163+ """Internal generator. Yields train/test indices."""
177164 ks = _KennardStone (
178165 n_groups = 1 ,
179166 scale = True ,
@@ -209,87 +196,39 @@ def train_test_split(
209196 random_state : None = None ,
210197 shuffle : None = None ,
211198) -> list [T ]:
212- """Split arrays or matrices into train and test subsets using the
213- Kennard-Stone algorithm.
199+ """Split arrays or matrices into train and test subsets via Kennard–Stone.
214200
215- Data partitioning by the Kennard-Stone algorithm is performed based on the
216- first element to be input.
201+ The Kennard–Stone partition is computed from the first input array only, so
202+ that the split is as uniform as possible. All subsequent arrays are split
203+ using the same indices.
217204
218205 Parameters
219206 ----------
220- *arrays: sequence of indexables with same length / shape[0]
221- Allowed inputs are lists, numpy arrays, scipy-sparse
222- matrices or pandas dataframes.
223-
224- test_size : float or int, optional
225- If float, should be between 0.0 and 1.0 and represent the proportion
226- of the dataset to include in the test split. If int, represents the
227- absolute number of test samples. If None, the value is set to the
228- complement of the train size. If train_size is also None, it will be
229- set to 0.25., by default None
230-
231- train_size : float or int, optional
232- If float, should be between 0.0 and 1.0 and represent the proportion
233- of the dataset to include in the train split. If int, represents the
234- absolute number of train samples. If None, the value is automatically
235- set to the complement of the test size., by default None
236-
237- metric : Union[Metrics, Callable[[ArrayLike, ArrayLike], np.ndarray]]
238- , optional
239-
240- The distance metric to use. See the documentation of
241- - `scipy.spatial.distance.pdist`
242- https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
243- - `sklearn.metrics.pairwise_distances`
244- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
245-
246- for valid values.
247- , by default "euclidean"
248-
249- Valid values for metric are:
250-
251- - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
252- 'l2', 'manhattan']. These metrics support sparse matrix inputs.
253- ['nan_euclidean'] but it does not yet support sparse matrices.
254- - From scipy.spatial.distance: ['braycurtis', 'canberra',
255- 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
256- 'mahalanobis', 'minkowski', 'rogerstanimoto',
257- 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
258- 'sqeuclidean', 'yule'] See the documentation for
259- scipy.spatial.distance for details on these metrics.
260- These metrics do not support sparse matrix inputs.
261-
262- If you want to use GPU when calculating the distance matrix
263- ('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
264- you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
265-
266- n_jobs : int, optional
267- The number of parallel jobs. It is valid only when CPU is used.
268- , by default None
269-
270- device : Literal['cpu', 'cuda', 'mps'] or torch.device or str, optional
271- , by default 'cpu'
272-
273- If you want to use GPU when calculating the distance matrix
274- ('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
275- you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
276-
277- random_state : None, deprecated
278- This parameter is deprecated and has no effect
279- because the algorithm is deterministic.
280-
281- shuffle : None, deprecated
282- This parameter is deprecated and has no effect
283- because the algorithm is deterministic.
207+ *arrays : sequence of indexables
208+ Arrays of equal length (list, ndarray, scipy-sparse, pandas DataFrame, etc.).
209+
210+ test_size : float or int, default=None
211+ Proportion in [0.0, 1.0] or absolute count. If ``None``, it becomes the
212+ complement of ``train_size``. If both are ``None``, defaults to 0.25.
213+
214+ train_size : float or int, default=None
215+ Proportion or absolute count for the train split. If ``None``, becomes
216+ the complement of ``test_size``.
217+
218+ metric, n_jobs, device : see also ``KFold``
219+
220+ random_state, shuffle : None, deprecated
221+ No effect (algorithm is deterministic).
284222
285223 Returns
286224 -------
287- splitting : list, length=2 * len(arrays)
288- List containing train-test split of inputs
225+ list
226+ A list like ``[X_train, X_test, y_train, y_test, ...]``.
289227
290228 Raises
291229 ------
292230 ValueError
231+ If no input arrays are provided.
293232 """
294233 if shuffle is not None :
295234 warnings .warn (
@@ -344,56 +283,17 @@ def __init__(
344283 n_jobs : Optional [int ] = None ,
345284 device : Device = "cpu" ,
346285 ) -> None :
347- """The root program of the Kennard-Stone algorithm,
348- an algorithm for evenly partitioning data.
286+ """Internal class implementing the core of the Kennard–Stone algorithm.
349287
350288 Parameters
351289 ----------
352- n_groups : int, optional
353- how many groups to divide, by default 1
354-
355- scale : bool, optional
356- scaling X or not, by default True
357-
358- metric : Union[Metrics, Callable[[ArrayLike, ArrayLike], np.ndarray]]
359- , optional
360-
361- The distance metric to use. See the documentation of
362- - `scipy.spatial.distance.pdist`
363- https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
364- - `sklearn.metrics.pairwise_distances`
365- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
366-
367- for valid values.
368- , by default "euclidean"
369-
370- Valid values for metric are:
371-
372- - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
373- 'l2', 'manhattan']. These metrics support sparse matrix inputs.
374- ['nan_euclidean'] but it does not yet support sparse matrices.
375- - From scipy.spatial.distance: ['braycurtis', 'canberra',
376- 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
377- 'mahalanobis', 'minkowski', 'rogerstanimoto',
378- 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
379- 'sqeuclidean', 'yule'] See the documentation for
380- scipy.spatial.distance for details on these metrics.
381- These metrics do not support sparse matrix inputs.
382-
383- If you want to use GPU when calculating the distance matrix
384- ('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
385- you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
386-
387- n_jobs : int, optional
388- The number of parallel jobs. It is valid only when CPU is used.
389- , by default None
390-
391- device : Literal['cpu', 'cuda', 'mps'] or torch.device or str, optional
392- , by default 'cpu'
393-
394- If you want to use GPU when calculating the distance matrix
395- ('euclidean', 'manhattan', 'chebyshev' and 'minkowski'),
396- you need to install 'pytorch' and set `device` to 'cuda' or 'mps'.
290+ n_groups : int, default=1
291+ Number of groups to split into.
292+
293+ scale : bool, default=True
294+ Whether to standardize features before computing distances.
295+
296+ metric, n_jobs, device : see also ``KFold``
397297 """
398298 self .n_groups = n_groups
399299 self .scale = scale
@@ -402,17 +302,17 @@ def __init__(
402302 self .device = device
403303
404304 def get_indexes (self , X : ArrayLike ) -> list [array [int ]]:
405- """Sort indexes by the Kennard-Stone algorithm.
305+ """Compute index sequences using the Kennard–Stone procedure.
406306
407307 Parameters
408308 ----------
409309 X : ArrayLike
410- The data to be sorted.
310+ 2D array of shape (n_samples, n_features).
411311
412312 Returns
413313 -------
414314 list[array[int]]
415- The sorted indexes.
315+ A list of index arrays corresponding to each group.
416316 """
417317 # check input array
418318 # scikit-learn 1.6+ deprecates 'force_all_finite' and 1.8 renames to
0 commit comments