"""Implementation of PCA with data-driven variance explanation limit"""
import warnings
import numpy as np
from kneed import KneeLocator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from divik.core import configurable
def knee(explained_variance) -> int:
    """Locate the empirical knee of an explained-variance curve.

    Parameters
    ----------
    explained_variance : numpy.ndarray
        Curve values, one entry per component. The curve is searched as a
        concave, increasing function of the component index.

    Returns
    -------
    int
        The component index reported by ``KneeLocator`` as the knee. When
        no knee is reported (or the locator fails with ``IndexError``),
        the total number of entries is returned instead.
    """
    n_points = explained_variance.size
    positions = np.arange(n_points, dtype=int)
    try:
        # KneeLocator may emit warnings on degenerate curves; they are
        # deliberately silenced here.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            locator = KneeLocator(
                x=positions,
                y=explained_variance,
                S=1.0,
                direction="increasing",
                curve="concave",
            )
            found = locator.knee
    except IndexError:  # This is needed for kneed >= 0.5.3
        found = None
    # Fall back to keeping every entry when no knee could be determined.
    return n_points if found is None else found
@configurable
class KneePCA(BaseEstimator, TransformerMixin):
"""Principal component analysis (PCA) with knee method
PCA with automated components selection based on knee method
over cumulative explained variance. Remaining components are
discarded.
Parameters
-----------
whiten : bool, optional (default False)
When True (False by default) the ``pca_.components_`` vectors are
multiplied by the square root of n_samples and then divided by the
singular values to ensure uncorrelated outputs with unit
component-wise variances.
Whitening will remove some information from the transformed signal
(the relative variance scales of the components) but can sometimes
improve the predictive accuracy of the downstream estimators by
making their data respect some hard-wired assumptions.
refit : bool, optional (default False)
When ``True`` (``False`` by default) the ``pca_`` is re-fit with the smaller
number of components. This could reduce memory footprint, but
requires fitting PCA a second time.
Attributes
----------
pca_ : PCA
Fit PCA estimator.
n_components_ : int
The number of selected components.
"""
def __init__(self, whiten: bool = False, refit: bool = False):
    """Store the estimator settings without validating or fitting.

    Parameters
    ----------
    whiten : bool, optional (default False)
        Forwarded to the underlying PCA as its ``whiten`` option.
    refit : bool, optional (default False)
        Whether ``fit`` should fit PCA again with only the selected
        number of components.
    """
    self.refit = refit
    self.whiten = whiten
def fit(self, X, y=None):
    """Fit PCA on X and pick the number of components with the knee method.

    A full PCA is fitted first; the knee of its cumulative explained
    variance ratio determines ``n_components_``. When ``refit`` is set,
    PCA is fitted once more restricted to that many components.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples
        and ``n_features`` is the number of features.

    y : Ignored.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # Settings shared by the initial fit and the optional refit.
    # Note: random_state is not used in this config!
    shared_options = dict(
        copy=True,
        whiten=self.whiten,
        svd_solver="full",
        tol=0.0,
        iterated_power="auto",
        random_state=None,
    )

    full_pca = PCA(n_components=None, **shared_options)
    self.pca_ = full_pca.fit(X)

    cumulative_ratio = np.cumsum(self.pca_.explained_variance_ratio_)
    self.n_components_ = knee(cumulative_ratio)

    if not self.refit:
        return self

    # Replace the full decomposition with one limited to the selected size.
    reduced_pca = PCA(n_components=self.n_components_, **shared_options)
    self.pca_ = reduced_pca.fit(X)
    return self