Source code for divik.core._subsets

import numpy as np
from sklearn.model_selection import KFold


class Subsets:
    """Scatter dataset to disjoint random subsets and combine them back

    Parameters
    ----------
    n_splits : int, default 10
        Number of subsets that will be generated.

    random_state : int, default 42
        Random state to use for seeding the random number generator.

    Examples
    --------
    >>> import numpy as np
    >>> from divik.core import Subsets
    >>> X = np.arange(20).reshape(10, 2)
    >>> subsets = Subsets(n_splits=5, random_state=42)
    >>> X_list = subsets.scatter(X)
    >>> len(X_list)
    5
    >>> # do some computations on each subset
    >>> y_list = [X_part.sum(axis=1) for X_part in X_list]
    >>> y = subsets.combine(y_list)
    >>> y.shape
    (10,)

    """

    def __init__(self, n_splits: int = 10, random_state: int = 42):
        self.n_splits = n_splits
        self.random_state = random_state

    def _fold_indices(self, X):
        """Return one array of row indices per subset.

        The rows are shuffled and partitioned with ``KFold``; only the
        held-out part of every split is kept, so each entry of the returned
        list describes one subset.
        """
        kfold = KFold(
            n_splits=self.n_splits, shuffle=True, random_state=self.random_state
        )
        return [idx for _, idx in kfold.split(X)]

    def scatter(self, X):
        """Split ``X`` row-wise into ``n_splits`` subsets.

        The number of rows and the row indices of every subset are stored on
        the instance so that a later call to :meth:`combine` can restore the
        original row order.

        Parameters
        ----------
        X : array-like
            Data to split. It must expose ``shape`` and support indexing with
            an array of row indices.

        Returns
        -------
        X_list : list
            ``X[idx]`` for each subset, in subset order.
        """
        self._n_rows = X.shape[0]
        self._idx = self._fold_indices(X)
        return [X[idx] for idx in self._idx]

    def _validate_shape(self, X_list):
        """Check that ``X_list`` matches the subsets created by ``scatter``.

        Raises
        ------
        ValueError
            If the total number of rows differs from the scattered data, the
            parts differ in number of dimensions or in trailing dimensions,
            the number of parts differs from the number of subsets, or any
            part has a different number of rows than its subset.
        """
        shapes = [X.shape for X in X_list]
        if sum(s[0] for s in shapes) != self._n_rows:
            raise ValueError("Rows number mismatch")
        if len({len(s) for s in shapes}) > 1:
            raise ValueError("Inconsistent ndim")
        if len({tuple(s[1:]) for s in shapes}) > 1:
            raise ValueError("Data shape mismatch")
        # The total row count alone is not sufficient: ``zip`` in ``combine``
        # would silently drop surplus parts, and NumPy would broadcast a
        # one-row part over a larger subset. Both yield a wrong result
        # without any error, so the parts are matched one by one.
        if len(shapes) != len(self._idx):
            raise ValueError("Subsets number mismatch")
        for idx, shape in zip(self._idx, shapes):
            if shape[0] != len(idx):
                raise ValueError("Subset rows number mismatch")

    def combine(self, X_list) -> np.ndarray:
        """Merge per-subset results back into the original row order.

        Parameters
        ----------
        X_list : list of numpy.ndarray
            One array per subset, in the order returned by :meth:`scatter`.
            The i-th array must have as many rows as the i-th subset; all
            arrays must agree on the remaining dimensions.

        Returns
        -------
        X : numpy.ndarray
            Array with as many rows as the scattered data, where the rows of
            every part are placed at the positions its subset was taken from.
            The dtype is taken from the first element of ``X_list``.

        Raises
        ------
        ValueError
            If ``X_list`` is not consistent with the stored subsets.
        AttributeError
            If :meth:`scatter` has not been called on this instance, since
            the stored row count and indices do not exist yet.
        """
        self._validate_shape(X_list)
        first = X_list[0]
        X = np.zeros((self._n_rows, *first.shape[1:]), dtype=first.dtype)
        for idx, X_part in zip(self._idx, X_list):
            X[idx] = X_part
        return X