# Source code for divik.feature_selection._percentage_selector

import numpy as np
from sklearn.base import BaseEstimator

from divik.core import configurable

from ._stat_selector_mixin import StatSelectorMixin

# noinspection PyAttributeOutsideInit
@configurable
class PercentageSelector(BaseEstimator, StatSelectorMixin):
    """Feature selector that removes / preserves top some percent of features

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Parameters
    ----------
    stat: {'mean', 'var'}
        Kind of statistic to be computed out of the feature.

    use_log: bool, optional, default: False
        Whether to use the logarithm of feature characteristic instead of the
        characteristic itself. This may improve feature filtering performance,
        depending on the distribution of features, however all the
        characteristics (mean, variance) have to be positive for that -
        filtering will fail otherwise. This is useful for specific cases in
        biology where the distribution of data may actually require this
        option for any efficient filtering.

    keep_top: bool, optional, default: True
        When True, keeps features with highest value of the characteristic.

    p: float, optional, default: 0.2
        Rate of features to keep.

    Attributes
    ----------
    vals_: array, shape (n_features,)
        Computed characteristic of each feature.

    threshold_: float
        Value of the threshold used for filtering

    selected_: array, shape (n_features,)
        Vector of binary selections of the informative features.
    """

    def __init__(
        self,
        stat: str,
        use_log: bool = False,
        keep_top: bool = True,
        p: float = 0.2,
    ):
        self.stat = stat
        self.use_log = use_log
        self.keep_top = keep_top
        self.p = p

    def fit(self, X, y=None):
        """Learn data-driven feature thresholds from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute feature characteristic.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        # Per-feature characteristic (mean or var, optionally log-scaled),
        # computed by StatSelectorMixin._to_characteristics from self.stat
        # and self.use_log.
        self.vals_ = self._to_characteristics(X)
        if self.keep_top:
            # Keep the top `p` fraction: threshold at the (1 - p) quantile
            # and select features at or above it.
            self.threshold_ = np.quantile(self.vals_, q=1 - self.p)
            self.selected_ = self.threshold_ <= self.vals_
        else:
            # Keep the bottom `p` fraction: threshold at the p-th quantile
            # and select features at or below it.
            self.threshold_ = np.quantile(self.vals_, q=self.p)
            self.selected_ = self.threshold_ >= self.vals_
        return self