Source code for divik.feature_extraction._spectral

import logging

import numpy as np
from scipy.spatial import distance as dist
from sklearn.base import BaseEstimator
from sklearn.manifold import SpectralEmbedding
from sklearn.utils.validation import check_is_fitted

from divik._utils import Data


def locally_adjusted_affinity(X: Data, d: str, neighbors: int = 7) -> Data:
    """Calculate affinity with local density correction

    Calculate affinity matrix based on input coordinates matrix and the number
    of nearest neighbors.

    Apply local scaling based on the k nearest neighbor

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster.

    d : str
        Measure of distance between points.

    neighbors : int
        The number of neighbors considered a local neighborhood.

    Returns
    -------

    affinity : array, shape [n_samples, n_samples]
        Adjusted affinity matrix.

    References:
    ----------
    https://towardsdatascience.com/spectral-graph-clustering-and-optimal-number-of-clusters-estimation-32704189afbe
    https://papers.nips.cc/paper/2619-self-tuning-spectral-clustering.pdf

    """
    distances = dist.pdist(X, metric=d)
    knn_distances = np.sort(distances, axis=0)[neighbors].reshape(-1, 1)
    local_scale = knn_distances.dot(knn_distances.T)
    affinity = - distances ** 2 / local_scale
    affinity[np.isnan(affinity)] = 0
    affinity = np.exp(affinity)
    np.fill_diagonal(affinity, 0)
    return affinity


[docs]class LocallyAdjustedRbfSpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Forms an affinity matrix given by the specified function and
    applies spectral decomposition to the corresponding graph laplacian.
    The resulting transformation is given by the value of the
    eigenvectors for each data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Parameters
    -----------
    distance : {'braycurtis', 'canberra', 'chebyshev', 'cityblock',
    'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
    'kulsinski', 'mahalanobis', 'atching', 'minkowski', 'rogerstanimoto',
    'russellrao', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'}
        Distance measure, defaults to 'euclidean'. These are the distances
        supported by scipy package.


    n_components : integer, default: 2
        The dimension of the projected subspace.

    random_state : int, RandomState instance or None, optional, default: None
        A pseudo random number generator used for the initialization of the
        lobpcg eigenvectors.  If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is the
        random number generator; If None, the random number generator is the
        RandomState instance used by `np.random`. Used when ``solver`` ==
        'amg'.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    n_neighbors : int, default : max(n_samples/10 , 1)
        Number of nearest neighbors for nearest_neighbors graph building.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Attributes
    ----------

    embedding_ : array, shape = (n_samples, n_components)
        Spectral embedding of the training matrix.

    References
    ----------

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
    """
    def __init__(self, distance: str = 'euclidean', n_components=2,
                 random_state=None, eigen_solver: str = None,
                 n_neighbors: int = None, n_jobs: int = 1):
        self.distance = distance
        self.n_components = n_components
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    # noinspection PyAttributeOutsideInit
[docs]    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples
            and n_features is the number of features.

        Y: Ignored.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        logging.debug('Computing locally adjusted affinities.')
        affinity_matrix_ = locally_adjusted_affinity(
            X, self.distance, self.n_neighbors)

        logging.debug('Computing embedding of affinities.')
        embedder = SpectralEmbedding(n_components=self.n_components,
                                     affinity='precomputed',
                                     gamma=None,
                                     random_state=self.random_state,
                                     eigen_solver=self.eigen_solver,
                                     n_neighbors=self.n_neighbors,
                                     n_jobs=self.n_jobs)
        self.embedding_ = embedder.fit_transform(affinity_matrix_)
        return self

[docs]    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples
            and n_features is the number of features.

        Y: Ignored.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        return self.fit(X).embedding_

[docs]    def save(self, destination: str):
        """Save embedding to a directory

        Parameters
        ----------
        destination : str
            Directory to save the embedding.
        """
        logging.info('Saving embedding to {0}.'.format(destination))
        check_is_fitted(self, 'embedding_')
        from functools import partial
        import os
        import pickle
        import divik._cli._utils as scr
        fname = partial(os.path.join, destination)

        logging.debug('Saving model.')
        with open(fname('model.pkl'), 'wb') as pkl:
            pickle.dump(self, pkl)

        logging.debug('Saving embedding.')
        scr.save_csv(self.embedding_, fname('embedding.csv'))
        np.save(fname('embedding.npy'), self.embedding_)