Source code for divik.feature_selection._specialized

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
from ._gmm_selector import GMMSelector
from ._outlier import OutlierSelector
from ._percentage_selector import PercentageSelector


class HighAbundanceAndVarianceSelector(BaseEstimator, SelectorMixin):
    """Feature selector that removes low-mean and low-variance features.

    Exercises ``GMMSelector`` to filter out the low-abundance noise features
    and select high-variance informative features.

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Parameters
    ----------
    use_log: bool, optional, default: False
        Whether to use the logarithm of the feature characteristic instead of
        the characteristic itself. This may improve feature filtering
        performance, depending on the distribution of features; however, all
        the characteristics (mean, variance) have to be positive for that -
        filtering will fail otherwise. This is useful for specific cases in
        biology where the distribution of data may actually require this
        option for any efficient filtering.

    min_features: int, optional, default: 1
        How many features must be preserved.

    min_features_rate: float, optional, default: 0.0
        Similar to ``min_features`` but relative to the number of input data
        features.

    max_components: int, optional, default: 10
        The maximum number of components used in the GMM decomposition.

    Attributes
    ----------
    abundance_selector_: GMMSelector
        Selector used to filter out the noise component.

    variance_selector_: GMMSelector
        Selector used to filter out the non-informative features.

    selected_: array, shape (n_features,)
        Vector of binary selections of the informative features.

    Examples
    --------
    >>> import numpy as np
    >>> import divik.feature_selection as fs
    >>> np.random.seed(42)
    >>> # Data in this case must be carefully crafted
    >>> labels = np.concatenate([30 * [0] + 20 * [1] + 30 * [2] + 40 * [3]])
    >>> data = np.vstack(100 * [labels * 10.])
    >>> data += np.random.randn(*data.shape)
    >>> sub = data[:, :-40]
    >>> sub += 5 * np.random.randn(*sub.shape)
    >>> # Label 0 has low abundance but high variance
    >>> # Label 3 has low variance but high abundance
    >>> # Labels 1 and 2 have non-lowest abundance and high variance
    >>> selector = fs.HighAbundanceAndVarianceSelector().fit(data)
    >>> selector.transform(labels.reshape(1, -1))
    array([[1 1 1 1 1 ...2 2 2]])
    """

    def __init__(self, use_log: bool = False, min_features: int = 1,
                 min_features_rate: float = 0., max_components: int = 10):
        self.use_log = use_log
        self.min_features = min_features
        self.min_features_rate = min_features_rate
        self.max_components = max_components
    def fit(self, X, y=None):
        """Learn data-driven feature thresholds from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute feature characteristic.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        min_features = max(
            self.min_features, self.min_features_rate * X.shape[1])
        self.abundance_selector_ = GMMSelector(
            'mean', use_log=self.use_log, n_candidates=1,
            min_features=min_features, preserve_high=True,
            max_components=self.max_components
        ).fit(X)
        filtered = self.abundance_selector_.transform(X)
        self.selected_ = self.abundance_selector_.selected_.copy()
        self.variance_selector_ = GMMSelector(
            'var', use_log=self.use_log, n_candidates=None,
            min_features=min_features, preserve_high=True,
            max_components=self.max_components
        ).fit(filtered)
        self.selected_[self.selected_] = self.variance_selector_.selected_
        return self
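
    # ``fit`` composes the two stages by boolean-mask indexing: using the
    # abundance mask to index itself rewrites exactly the positions that
    # survived the first stage with the variance stage's verdicts. A minimal
    # sketch of the trick, with made-up masks (not library data):
    #
    #     >>> import numpy as np
    #     >>> outer = np.array([True, False, True, True])
    #     >>> inner = np.array([True, False, True])  # one entry per survivor
    #     >>> outer[outer] = inner
    #     >>> outer
    #     array([ True, False, False,  True])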

    def _get_support_mask(self):
        """Get the boolean mask indicating which features are selected.

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected
            for retention.
        """
        return self.selected_
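

# Because ``fit`` accepts and ignores ``y``, these selectors drop directly
# into scikit-learn pipelines. A minimal sketch; the KMeans step is purely
# illustrative and not part of this module:
#
#     >>> from sklearn.cluster import KMeans
#     >>> from sklearn.pipeline import Pipeline
#     >>> pipe = Pipeline([
#     ...     ('select', HighAbundanceAndVarianceSelector()),
#     ...     ('cluster', KMeans(n_clusters=2)),
#     ... ])
#     >>> # pipe.fit(data) filters features, then clusters the survivors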


EPS = 10e-6  # i.e. 1e-5; tolerance for the "(nearly) all features kept" check


# noinspection PyAttributeOutsideInit
class OutlierAbundanceAndVarianceSelector(BaseEstimator, SelectorMixin):
    def __init__(self, use_log: bool = False,
                 min_features_rate: float = 0.01, p: float = 0.2):
        # Parameters mirror HighAbundanceAndVarianceSelector where shared:
        # ``use_log`` filters on the logarithm of the feature characteristic,
        # ``min_features_rate`` is the minimal fraction of features to keep.
        # ``p`` drives the percentage-based fallback selectors (see
        # ``_fit_abundance`` and ``_fit_variance``).
        self.use_log = use_log
        self.min_features_rate = min_features_rate
        self.p = p

    def fit(self, X, y=None):
        """Learn data-driven feature thresholds from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute feature characteristic.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        self.abundance_selector_, a_selected = self._fit_abundance(X)
        filtered = X[:, a_selected]
        self.variance_selector_, v_selected = self._fit_variance(
            filtered, a_selected)
        # Copy, so the abundance mask is not mutated in place, then map the
        # variance sub-selection back onto the full feature axis.
        self.selected_ = a_selected.copy()
        self.selected_[a_selected] = v_selected
        return self

    def _fit_abundance(self, X):
        selector = OutlierSelector(stat='mean', use_log=self.use_log,
                                   keep_outliers=False).fit(X)
        # Copy, so extending the selection below does not clobber the
        # sub-selector's own mask.
        selected = selector.selected_.copy()
        # Anchor on an inlier value and keep everything above it, so that
        # only the low-abundance outliers get discarded.
        inlier = selector.vals_[selected][0]
        over_inlier = selector.vals_ > inlier
        selected[over_inlier] = True
        p = selected.mean()
        # Fall back to a percentage-based selection when the outlier filter
        # keeps too few features or (nearly) all of them.
        if p < self.min_features_rate or p >= 1 - EPS:
            selector = PercentageSelector(stat='mean', use_log=self.use_log,
                                          keep_top=True, p=1.0 - self.p).fit(X)
            selected = selector.selected_
        return selector, selected

    def _fit_variance(self, X, old_selected):
        # Rescale the thresholds by the fraction of features that already
        # survived the abundance stage.
        corrected_min = self.min_features_rate / old_selected.mean()
        corrected_p = self.p / old_selected.mean()
        selector = OutlierSelector(stat='var', use_log=self.use_log,
                                   keep_outliers=True).fit(X)
        selected = selector.selected_.copy()
        # Deselect everything below an inlier value, so that only the
        # high-variance outliers are retained.
        inlier = selector.vals_[selected == 0][0]
        under_inlier = selector.vals_ < inlier
        selected[under_inlier] = False
        p = selected.mean()
        if p < corrected_min or p >= 1 - EPS:
            selector = PercentageSelector(stat='var', use_log=self.use_log,
                                          keep_top=True, p=corrected_p).fit(X)
            selected = selector.selected_
        return selector, selected

    def _get_support_mask(self):
        """Get the boolean mask indicating which features are selected.

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected
            for retention.
        """
        return self.selected_
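

# A minimal usage sketch for OutlierAbundanceAndVarianceSelector, assuming it
# is exported from ``divik.feature_selection`` like its sibling above; the
# random data is illustrative only:
#
#     >>> import numpy as np
#     >>> import divik.feature_selection as fs
#     >>> np.random.seed(42)
#     >>> data = np.abs(np.random.randn(100, 120))
#     >>> selector = fs.OutlierAbundanceAndVarianceSelector(p=0.2).fit(data)
#     >>> selector.selected_.shape  # one boolean verdict per input feature
#     (120,)
#     >>> reduced = selector.transform(data)  # keeps only selected columns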