Source code for divik.sampler._stratified_sampler

import uuid
from contextlib import contextmanager
from typing import Union

from sklearn.model_selection import StratifiedShuffleSplit

from divik.core import configurable, share

from ._core import BaseSampler, ParallelSampler

_DATA = {}


[docs]@configurable class StratifiedSampler(BaseSampler): """Sample the original data preserving proportions of groups Parameters ----------- n_rows : int or float, optional (default 10000) Allows to limit the number of rows in the drawn samples. If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the sample. If int, represents the absolute number of rows. n_samples : int, optional (default None) Allows to limit the number of samples when iterating Attributes ---------- X_ : array_like, shape (n_rows, n_features) Data to sample from y_ : array_like, shape (n_rows,) Group labels """ def __init__(self, n_rows: Union[int, float] = 100, n_samples: int = None): self.n_rows = n_rows self.n_samples = n_samples
[docs] def fit(self, X, y): """Fit the model from data in X. Both inputs are preserved inside to sample from the data. Parameters ---------- X : array-like, shape (n_rows, n_features) Training vector, where n_rows is the number of rows and n_features is the number of features. y: array-like, shape (n_rows,) Returns ------- self : StratifiedSampler Returns the instance itself. """ self.X_ = X self.y_ = y return self
[docs] def get_sample(self, seed): """Return specific sample Sample is drawn from the set of existing rows. A proportion of gorups should be more-or-less the same, depending on the size of the sample. Parameters ---------- seed : int The seed to use to draw the sample Returns ------- sample : array_like, (*self.shape_) Returns the drawn sample """ split = StratifiedShuffleSplit( n_splits=1, train_size=self.n_rows, random_state=seed ) for idx, _ in split.split(self.X_, self.y_): return self.X_[idx]
[docs] @contextmanager def parallel(self): global _DATA ref = str(uuid.uuid4()) with share(self.X_) as X, share(self.y_) as y: _DATA[ref] = ((self.n_rows, self.n_samples), (X, y)) try: yield StratifiedParallelSampler(ref) finally: del _DATA[ref]
class StratifiedParallelSampler(ParallelSampler): def __init__(self, ref): self._ref = ref @property def sampler(self): global _DATA init_args, fit_args = _DATA[self._ref] X, y = fit_args return StratifiedSampler(*init_args).fit(X.value, y.value) def get_sample(self, seed): return self.sampler.get_sample(seed) def initializer(self, *args): global _DATA _DATA[self._ref] = args @property def initargs(self): global _DATA return _DATA[self._ref]