Source code for divik.feature_selection._percentage_selector

import numpy as np
from sklearn.base import BaseEstimator

from divik.core import configurable

from ._stat_selector_mixin import StatSelectorMixin


# noinspection PyAttributeOutsideInit
[docs]@configurable
class PercentageSelector(BaseEstimator, StatSelectorMixin):
    """Feature selector that removes / preserves top some percent of features

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Parameters
    ----------
    stat: {'mean', 'var'}
        Kind of statistic to be computed out of the feature.

    use_log: bool, optional, default: False
        Whether to use the logarithm of feature characteristic instead of the
        characteristic itself. This may improve feature filtering performance,
        depending on the distribution of features, however all the
        characteristics (mean, variance) have to be positive for that -
        filtering will fail otherwise. This is useful for specific cases in
        biology where the distribution of data may actually require this option
        for any efficient filtering.

    keep_top: bool, optional, default: True
        When True, keeps features with highest value of the characteristic.

    p: float, optional, default: 0.2
        Rate of features to keep.

    Attributes
    ----------
    vals_: array, shape (n_features,)
        Computed characteristic of each feature.

    threshold_: float
        Value of the threshold used for filtering

    selected_: array, shape (n_features,)
        Vector of binary selections of the informative features.
    """

    def __init__(
        self, stat: str, use_log: bool = False, keep_top: bool = True, p: float = 0.2
    ):
        self.stat = stat
        self.use_log = use_log
        self.keep_top = keep_top
        self.p = p

[docs]    def fit(self, X, y=None):
        """Learn data-driven feature thresholds from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute feature characteristic.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        self.vals_ = self._to_characteristics(X)
        if self.keep_top:
            self.threshold_ = np.quantile(self.vals_, q=1 - self.p)
            self.selected_ = self.threshold_ <= self.vals_
        else:
            self.threshold_ = np.quantile(self.vals_, q=self.p)
            self.selected_ = self.threshold_ >= self.vals_
        return self