import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin
from ._gmm_selector import GMMSelector
from ._outlier import OutlierSelector
from ._percentage_selector import PercentageSelector
class HighAbundanceAndVarianceSelector(BaseEstimator, SelectorMixin):
"""Feature selector that removes low-mean and low-variance features
Exercises ``GMMSelector`` to filter out the low-abundance noise features
and select high-variance informative features.
This feature selection algorithm looks only at the features (X), not the
desired outputs (y), and can thus be used for unsupervised learning.
Parameters
----------
use_log: bool, optional, default: False
Whether to use the logarithm of feature characteristic instead of the
characteristic itself. This may improve feature filtering performance,
depending on the distribution of features, however all the
characteristics (mean, variance) have to be positive for that -
filtering will fail otherwise. This is useful for specific cases in
biology where the distribution of data may actually require this option
for any efficient filtering.
min_features: int, optional, default: 1
How many features must be preserved.
min_features_rate: float, optional, default: 0.0
Similar to ``min_features`` but relative to the input data features
number.
max_components: int, optional, default: 10
The maximum number of components used in the GMM decomposition.
Attributes
----------
abundance_selector_: GMMSelector
Selector used to filter out the noise component.
variance_selector_: GMMSelector
Selector used to filter out the non-informative features.
selected_: array, shape (n_features,)
Vector of binary selections of the informative features.
Examples
--------
>>> import numpy as np
>>> import divik.feature_selection as fs
>>> np.random.seed(42)
>>> # Data in this case must be carefully crafted
>>> labels = np.concatenate([30 * [0] + 20 * [1] + 30 * [2] + 40 * [3]])
>>> data = np.vstack(100 * [labels * 10.])
>>> data += np.random.randn(*data.shape)
>>> sub = data[:, :-40]
>>> sub += 5 * np.random.randn(*sub.shape)
>>> # Label 0 has low abundance but high variance
>>> # Label 3 has low variance but high abundance
>>> # Label 1 and 2 has not-lowest abundance and high variance
>>> selector = fs.HighAbundanceAndVarianceSelector().fit(data)
>>> selector.transform(labels.reshape(1,-1))
array([[1 1 1 1 1 ...2 2 2]])
"""
def __init__(self, use_log: bool = False, min_features: int = 1,
min_features_rate: float = 0., max_components: int = 10):
self.use_log = use_log
self.min_features = min_features
self.min_features_rate = min_features_rate
self.max_components = max_components
    def fit(self, X, y=None):
"""Learn data-driven feature thresholds from X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Sample vectors from which to compute feature characteristic.
y : any
Ignored. This parameter exists only for compatibility with
sklearn.pipeline.Pipeline.
Returns
-------
self
"""
        min_features = max(
            self.min_features, self.min_features_rate * X.shape[1])
        # Stage 1: a GMM decomposition of feature means separates the
        # low-abundance noise component from the abundant features.
        self.abundance_selector_ = GMMSelector(
            'mean', use_log=self.use_log, n_candidates=1,
            min_features=min_features, preserve_high=True,
            max_components=self.max_components
        ).fit(X)
        filtered = self.abundance_selector_.transform(X)
        self.selected_ = self.abundance_selector_.selected_.copy()
        # Stage 2: among the abundant features, a GMM decomposition of
        # variances keeps the high-variance (informative) ones.
        self.variance_selector_ = GMMSelector(
            'var', use_log=self.use_log, n_candidates=None,
            min_features=min_features, preserve_high=True,
            max_components=self.max_components
        ).fit(filtered)
        # The variance mask indexes only the features that survived the
        # abundance stage, hence the masked assignment.
        self.selected_[self.selected_] = self.variance_selector_.selected_
        return self
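    # A hedged sketch of the ``Pipeline`` compatibility mentioned in ``fit``;
    # ``MinMaxScaler`` and ``data`` are illustrative assumptions only:
    #
    #     from sklearn.pipeline import Pipeline
    #     from sklearn.preprocessing import MinMaxScaler
    #     pipe = Pipeline([
    #         ('select', HighAbundanceAndVarianceSelector()),
    #         ('scale', MinMaxScaler()),
    #     ])
    #     pipe.fit_transform(data)  # y may be omitted; the selector ignores it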
def _get_support_mask(self):
"""
Get the boolean mask indicating which features are selected
Returns
-------
support : boolean array of shape [# input features]
An element is True iff its corresponding feature is selected for
retention.
"""
return self.selected_
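

# A minimal sketch of what ``HighAbundanceAndVarianceSelector.fit`` computes,
# chaining the same ``GMMSelector`` API explicitly. Defaults for ``use_log``,
# ``min_features`` and ``max_components`` are assumed; the synthetic data
# recipe is borrowed from the class docstring, and the function is
# illustrative only, not part of the public API.
def _example_two_stage_gmm_selection():  # pragma: no cover
    np.random.seed(42)
    labels = np.concatenate([30 * [0] + 20 * [1] + 30 * [2] + 40 * [3]])
    data = np.vstack(100 * [labels * 10.]) + np.random.randn(100, labels.size)
    # Stage 1: keep the high-mean GMM components of the feature means.
    abundance = GMMSelector('mean', n_candidates=1, preserve_high=True).fit(data)
    # Stage 2: among those, keep the high-variance GMM components.
    variance = GMMSelector('var', n_candidates=None, preserve_high=True).fit(
        abundance.transform(data))
    # A feature survives only if it passed both stages.
    selected = abundance.selected_.copy()
    selected[selected] = variance.selected_
    return selected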
EPS = 1e-5  # tolerance for the "(almost) all features selected" checks below
# noinspection PyAttributeOutsideInit
class OutlierAbundanceAndVarianceSelector(BaseEstimator, SelectorMixin):
    """Feature selector keeping features with non-outlyingly low mean and
    outlyingly high variance.

    Falls back to percentage-based selection whenever the outlier criterion
    would keep too few or nearly all features.
    """
def __init__(self, use_log: bool = False, min_features_rate: float = 0.01,
p: float = 0.2):
self.use_log = use_log
self.min_features_rate = min_features_rate
self.p = p
    def fit(self, X, y=None):
"""Learn data-driven feature thresholds from X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Sample vectors from which to compute feature characteristic.
y : any
Ignored. This parameter exists only for compatibility with
sklearn.pipeline.Pipeline.
Returns
-------
self
"""
        self.abundance_selector_, a_selected = self._fit_abundance(X)
        filtered = X[:, a_selected]
        self.variance_selector_, v_selected = self._fit_variance(
            filtered, a_selected)
        # Merge both stages on a copy, so the sub-selectors' masks stay
        # intact: a feature is kept only if it passed both of them.
        self.selected_ = a_selected.copy()
        self.selected_[a_selected] = v_selected
        return self
    def _fit_abundance(self, X):
        selector = OutlierSelector(stat='mean', use_log=self.use_log,
                                   keep_outliers=False).fit(X)
        # Copy, so that tweaking the mask below does not mutate the
        # sub-selector's own ``selected_`` attribute.
        selected = selector.selected_.copy()
        # Any inlier works as a reference: every high-tail outlier lies
        # above it, so this re-adds the high-abundance outliers and leaves
        # only the low-abundance tail rejected.
        inlier = selector.vals_[selected][0]
        over_inlier = selector.vals_ > inlier
        selected[over_inlier] = True
        p = selected.mean()
        # Fall back to percentage-based selection when the outlier
        # criterion keeps too few features or (almost) all of them.
        if p < self.min_features_rate or p >= 1 - EPS:
            selector = PercentageSelector(stat='mean', use_log=self.use_log,
                                          keep_top=True, p=1.0 - self.p).fit(X)
            selected = selector.selected_
        return selector, selected
    def _fit_variance(self, X, old_selected):
        # X contains only the features that survived the abundance stage,
        # so the rates are rescaled to stay relative to the original count.
        corrected_min = self.min_features_rate / old_selected.mean()
        corrected_p = self.p / old_selected.mean()
        selector = OutlierSelector(stat='var', use_log=self.use_log,
                                   keep_outliers=True).fit(X)
        # Copy, so that tweaking the mask below does not mutate the
        # sub-selector's own ``selected_`` attribute.
        selected = selector.selected_.copy()
        # Any inlier works as a reference: every low-tail outlier lies
        # below it, so this drops the low-variance outliers and keeps
        # only the high-variance ones.
        inlier = selector.vals_[selected == 0][0]
        under_inlier = selector.vals_ < inlier
        selected[under_inlier] = False
        p = selected.mean()
        # Fall back to percentage-based selection when the outlier
        # criterion keeps too few features or (almost) all of them.
        if p < corrected_min or p >= 1 - EPS:
            selector = PercentageSelector(stat='var', use_log=self.use_log,
                                          keep_top=True, p=corrected_p).fit(X)
            selected = selector.selected_
        return selector, selected
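    # Worked example of the rate correction above (numbers assumed): if the
    # abundance stage kept 40% of the features (``old_selected.mean() == 0.4``)
    # and ``p == 0.2``, then ``corrected_p == 0.2 / 0.4 == 0.5``, i.e. keeping
    # the top half of the remaining features still equals 20% of the originals.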
def _get_support_mask(self):
"""
Get the boolean mask indicating which features are selected
Returns
-------
support : boolean array of shape [# input features]
An element is True iff its corresponding feature is selected for
retention.
"""
return self.selected_
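

# A hedged end-to-end usage sketch for ``OutlierAbundanceAndVarianceSelector``.
# The synthetic data recipe is an assumption reused from the
# ``HighAbundanceAndVarianceSelector`` docstring example; the function is
# illustrative only and not part of the public API.
def _example_outlier_based_selection():  # pragma: no cover
    np.random.seed(42)
    labels = np.concatenate([30 * [0] + 20 * [1] + 30 * [2] + 40 * [3]])
    data = np.vstack(100 * [labels * 10.])
    data += np.random.randn(*data.shape)
    selector = OutlierAbundanceAndVarianceSelector(p=0.2).fit(data)
    # ``transform`` keeps only the columns flagged True in ``selected_``;
    # when either outlier stage keeps too few (or nearly all) features,
    # ``fit`` falls back to percentage-based selection internally.
    return selector.transform(data)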