Source code for divik.feature_selection._gmm_selector

import numpy as np
from sklearn.base import BaseEstimator

import divik._matlab_legacy as ml
from ._stat_selector_mixin import StatSelectorMixin


class GMMSelector(BaseEstimator, StatSelectorMixin):
    """Feature selector that removes low- or high-mean (or variance) features

    Gaussian Mixture Modeling is applied to the features' characteristics
    and components are obtained. Crossing points of the components are
    considered candidate thresholds. Out of these, up to ``n_candidates``
    components are removed in such a way that at least ``min_features`` or
    ``min_features_rate`` features are retained.

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Parameters
    ----------
    stat: {'mean', 'var'}
        Kind of statistic to be computed out of the feature.

    use_log: bool, optional, default: False
        Whether to use the logarithm of the feature characteristic instead
        of the characteristic itself. This may improve feature filtering
        performance, depending on the distribution of features; however,
        all the characteristics (mean, variance) have to be positive for
        that - filtering will fail otherwise. This is useful for specific
        cases in biology, where the distribution of data may actually
        require this option for any efficient filtering.

    n_candidates: int, optional, default: None
        How many candidate thresholds to use at most. ``0`` preserves all
        the features (all candidate thresholds are discarded), ``None``
        allows to remove all but one component (all candidate thresholds
        are retained). A negative value means to discard up to all but
        ``-n_candidates`` candidates, e.g. ``-1`` will retain at least two
        components (one candidate threshold is removed).

    min_features: int, optional, default: 1
        How many features must be preserved. Candidate thresholds are
        tested against this value; if a threshold would retain fewer
        features, a less conservative threshold is selected instead.

    min_features_rate: float, optional, default: 0.0
        Similar to ``min_features``, but relative to the number of input
        data features.

    preserve_high: bool, optional, default: True
        Whether to preserve the high-characteristic features or the
        low-characteristic ones.

    max_components: int, optional, default: 10
        The maximum number of components used in the GMM decomposition.

    Attributes
    ----------
    vals_: array, shape (n_features,)
        Computed characteristic of each feature.

    threshold_: float
        Threshold value to filter the features by the characteristic.

    raw_threshold_: float
        Threshold value mapped back to the characteristic space (no
        logarithm, etc.).

    selected_: array, shape (n_features,)
        Vector of binary selections of the informative features.

    Examples
    --------
    >>> import numpy as np
    >>> import divik.feature_selection as fs
    >>> np.random.seed(42)
    >>> labels = np.concatenate([30 * [0] + 20 * [1] + 30 * [2] + 40 * [3]])
    >>> data = labels * 5 + np.random.randn(*labels.shape)
    >>> fs.GMMSelector('mean').fit_transform(data)
    array([[14.78032811 15.35711257 ... 15.75193303]])
    >>> fs.GMMSelector('mean', preserve_high=False).fit_transform(data)
    array([[ 0.49671415 -0.1382643  ... -0.29169375]])
    >>> fs.GMMSelector('mean', n_candidates=-1).fit_transform(data)
    array([[10.32408397  9.61491772 ... 15.75193303]])
    """

    def __init__(self, stat: str, use_log: bool = False,
                 n_candidates: int = None, min_features: int = 1,
                 min_features_rate: float = 0.0, preserve_high: bool = True,
                 max_components: int = 10):
        if stat not in {'mean', 'var'}:
            raise ValueError('stat must be one of {"mean", "var"}')
        self.stat = stat
        self.use_log = use_log
        self.n_candidates = n_candidates
        self.min_features = min_features
        self.min_features_rate = min_features_rate
        self.preserve_high = preserve_high
        self.max_components = max_components
    def fit(self, X, y=None):
        """Learn data-driven feature thresholds from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute the feature characteristic.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        self.vals_ = self._to_characteristics(X)
        # The translation is due to a MATLAB limitation: thresholds are
        # found on values shifted to start at zero, then shifted back.
        thrs = ml.find_thresholds(
            self.vals_ - self.vals_.min(),
            max_components=self.max_components,
            throw_on_engine_error=False) + self.vals_.min()
        n_candidates = len(thrs) if self.n_candidates is None \
            else self.n_candidates
        desired_thrs = thrs[:n_candidates]
        min_features = max(
            self.min_features, self.min_features_rate * X.shape[1])
        # Walk the candidate thresholds from the last one back, keeping the
        # first threshold that retains at least ``min_features`` features.
        for thr in reversed(desired_thrs):
            selected = self.vals_ >= thr
            if selected.sum() >= min_features:
                break
        else:
            # No candidate retains enough features - keep all of them.
            selected = np.ones((X.shape[1],), dtype=bool)
            thr = -np.inf
        self.threshold_ = thr
        self.raw_threshold_ = self._to_raw(thr)
        self.selected_ = selected
        return self
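
A minimal usage sketch on two-dimensional data, to complement the doctest
above: the toy data and variable names below are illustrative, and
``transform`` is assumed to be provided by the scikit-learn selector
machinery through ``StatSelectorMixin``.

import numpy as np

import divik.feature_selection as fs

# Toy data: 100 samples, 200 features; the second half of the features
# has a clearly higher mean than the first half.
np.random.seed(0)
X = np.hstack([np.random.randn(100, 100),
               np.random.randn(100, 100) + 10])

selector = fs.GMMSelector('mean').fit(X)
X_reduced = selector.transform(X)  # keeps only the high-mean features

print(X_reduced.shape)         # roughly (100, 100): high-mean half kept
print(selector.threshold_)     # learned cut point on the feature means
print(selector.selected_[:5])  # boolean mask over all 200 features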