Source code for divik.feature_extraction._spectral

import logging

import numpy as np
from scipy.spatial import distance as dist
from sklearn.base import BaseEstimator
from sklearn.manifold import SpectralEmbedding
from sklearn.utils.validation import check_is_fitted

from divik.core import configurable
from divik.core.io import save_csv


@configurable
class LocallyAdjustedRbfSpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Computes the pairwise distance matrix of the samples with
    ``scipy.spatial.distance.pdist`` and hands it to
    :class:`sklearn.manifold.SpectralEmbedding` configured with
    ``affinity="precomputed_nearest_neighbors"``. The resulting
    transformation is given by the value of the eigenvectors for each
    data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Parameters
    -----------
    distance : {'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
        'kulsinski', 'mahalanobis', 'matching', 'minkowski',
        'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath',
        'sqeuclidean', 'yule'}
        Distance measure, defaults to ``euclidean``. The value is passed
        as ``metric`` to ``scipy.spatial.distance.pdist``, so the set of
        accepted names depends on the installed scipy version.

    n_components : integer or float, default: 2
        The dimension of the projected subspace. An integer ``>= 1`` is
        used as the number of components. A value in ``[0, 1]`` that is
        not such an integer is interpreted as a fraction of the number of
        input features (at least one component is always kept).

    random_state : int, RandomState instance or None, optional, default: None
        Forwarded to :class:`sklearn.manifold.SpectralEmbedding`.
        A pseudo random number generator used for the initialization of
        the lobpcg eigenvectors. If int, random_state is the seed used by
        the random number generator; if RandomState instance, random_state
        is the random number generator; if None, the random number
        generator is the RandomState instance used by ``np.random``.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    n_neighbors : int, default : max(n_samples/10 , 1)
        Number of nearest neighbors for nearest_neighbors graph building.
        ``None`` leaves the choice to
        :class:`sklearn.manifold.SpectralEmbedding`.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Attributes
    ----------
    embedding_ : array, shape = (n_samples, n_components)
        Spectral embedding of the training matrix.

    References
    ----------
    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
    """

    def __init__(
        self,
        distance: str = "euclidean",
        n_components=2,
        random_state=None,
        eigen_solver: str = None,
        n_neighbors: int = None,
        n_jobs: int = 1,
    ):
        self.distance = distance
        self.n_components = n_components
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _resolve_n_components(self, n_features):
        """Translate ``self.n_components`` into the value given to sklearn.

        Parameters
        ----------
        n_features : int
            Number of columns of the data being embedded; the base for
            fractional ``n_components``.

        Returns
        -------
        n_components : int or the original ``self.n_components``
            Requested number of embedding dimensions.
        """
        requested = self.n_components
        # An explicit integer count (e.g. 1) must be taken literally and
        # not mistaken for the fraction 1.0 (= "all features").
        if isinstance(requested, (int, np.integer)) and requested >= 1:
            return requested
        if 0 <= requested <= 1:
            # Fraction of the input dimensionality, never below one.
            return max(int(requested * n_features), 1)
        return requested

    # noinspection PyAttributeOutsideInit
    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples
            and n_features is the number of features.

        y : Ignored.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        logging.debug("Computing locally adjusted affinities.")
        # Dense, symmetric (n_samples, n_samples) distance matrix.
        d = dist.squareform(dist.pdist(X, metric=self.distance))
        # np.shape also works for plain nested lists, unlike X.shape.
        n_components = self._resolve_n_components(np.shape(X)[1])
        logging.debug("Computing embedding of affinities.")
        embedder = SpectralEmbedding(
            n_components=n_components,
            affinity="precomputed_nearest_neighbors",
            gamma=None,
            random_state=self.random_state,
            eigen_solver=self.eigen_solver,
            n_neighbors=self.n_neighbors,
            n_jobs=self.n_jobs,
        )
        self.embedding_ = embedder.fit_transform(d)
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples
            and n_features is the number of features.

        y : Ignored.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        return self.fit(X).embedding_

    def transform(self, X, y=None):
        """Return the embedding, fitting on X first when necessary.

        The stored ``embedding_`` is returned as-is when the estimator is
        already fitted and the number of rows of ``X`` equals the number
        of embedded samples. Otherwise the estimator is (re)fitted on
        ``X``, replacing any previous embedding.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Data to embed.

        y : Ignored (forwarded to ``fit``).

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        n_samples = np.shape(X)[0]
        if not hasattr(self, "embedding_") or self.embedding_.shape[0] != n_samples:
            self.fit(X, y)
        return self.embedding_

    def save(self, destination: str):
        """Save embedding to a directory

        Writes ``model.pkl`` (the pickled estimator), ``embedding.csv``
        and ``embedding.npy`` into ``destination``. The directory is
        created when it does not exist yet.

        Parameters
        ----------
        destination : str
            Directory to save the embedding.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the estimator has not been fitted.
        """
        logging.info("Saving embedding to %s.", destination)
        check_is_fitted(self)
        import os
        import pickle

        # Avoid FileNotFoundError when the target directory is missing.
        os.makedirs(destination, exist_ok=True)

        logging.debug("Saving model.")
        with open(os.path.join(destination, "model.pkl"), "wb") as pkl:
            pickle.dump(self, pkl)

        logging.debug("Saving embedding.")
        save_csv(self.embedding_, os.path.join(destination, "embedding.csv"))
        np.save(os.path.join(destination, "embedding.npy"), self.embedding_)