Source code for skdim.id._TLE

#
# BSD 3-Clause License
#
# Copyright (c) 2020, Jonathan Bac
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
import numpy as np
from .._commonfuncs import LocalEstimator
from scipy.spatial.distance import pdist, squareform


[docs]class TLE(LocalEstimator): """Intrinsic dimension estimation using the Tight Local intrinsic dimensionality Estimator algorithm. [Amsaleg2019]_ [IDRadovanović]_ Parameters ---------- epsilon: float """ _N_NEIGHBORS = 20 def __init__( self, epsilon=1e-4, ): self.epsilon = epsilon def _fit(self, X, dists, knnidx): self.dimension_pw_ = np.zeros(len(X)) for i in range(len(X)): self.dimension_pw_[i] = self._idtle(X[knnidx[i, :]], dists[[i], :]) def _idtle(self, nn, dists): # nn - matrix of nearest neighbors (n_neighbors x d), sorted by distance # dists - nearest-neighbor distances (1 x n_neighbors), sorted r = dists[0, -1] # distance to n_neighbors-th neighbor # Boundary case 1: If $r = 0$, this is fatal, since the neighborhood would be degenerate. if r == 0: raise ValueError("All k-NN distances are zero!") # Main computation n_neighbors = dists.shape[1] V = squareform(pdist(nn)) Di = np.tile(dists.T, (1, n_neighbors)) Dj = Di.T Z2 = 2 * Di ** 2 + 2 * Dj ** 2 - V ** 2 S = ( r * ( ((Di ** 2 + V ** 2 - Dj ** 2) ** 2 + 4 * V ** 2 * (r ** 2 - Di ** 2)) ** 0.5 - (Di ** 2 + V ** 2 - Dj ** 2) ) / (2 * (r ** 2 - Di ** 2)) ) T = ( r * ( ((Di ** 2 + Z2 - Dj ** 2) ** 2 + 4 * Z2 * (r ** 2 - Di ** 2)) ** 0.5 - (Di ** 2 + Z2 - Dj ** 2) ) / (2 * (r ** 2 - Di ** 2)) ) # handle case of repeating k-NN distances Dr = (dists == r).squeeze() S[Dr, :] = r * V[Dr, :] ** 2 / (r ** 2 + V[Dr, :] ** 2 - Dj[Dr, :] ** 2) T[Dr, :] = r * Z2[Dr, :] / (r ** 2 + Z2[Dr, :] - Dj[Dr, :] ** 2) # Boundary case 2: If $u_i = 0$, then for all $1\leq j\leq n_neighbors$ the measurements $s_{ij}$ and $t_{ij}$ reduce to $u_j$. Di0 = (Di == 0).squeeze() T[Di0] = Dj[Di0] S[Di0] = Dj[Di0] # Boundary case 3: If $u_j = 0$, then for all $1\leq j\leq n_neighbors$ the measurements $s_{ij}$ and $t_{ij}$ reduce to $\frac{r v_{ij}}{r + v_{ij}}$. Dj0 = (Dj == 0).squeeze() T[Dj0] = r * V[Dj0] / (r + V[Dj0]) S[Dj0] = r * V[Dj0] / (r + V[Dj0]) # Boundary case 4: If $v_{ij} = 0$, then the measurement $s_{ij}$ is zero and must be dropped. The measurement $t_{ij}$ should be dropped as well. V0 = (V == 0).squeeze() np.fill_diagonal(V0, False) # by setting to r, $t_{ij}$ will not contribute to the sum s1t T[V0] = r # by setting to r, $s_{ij}$ will not contribute to the sum s1s S[V0] = r # will subtract twice this number during ID computation below nV0 = np.sum(V0) # Drop T & S measurements below epsilon (V4: If $s_{ij}$ is thrown out, then for the sake of balance, $t_{ij}$ should be thrown out as well (or vice versa).) TSeps = (T < self.epsilon) | (S < self.epsilon) np.fill_diagonal(TSeps, 0) nTSeps = np.sum(TSeps) T[TSeps] = r T = np.log(T / r) S[TSeps] = r S = np.log(S / r) np.fill_diagonal(T, 0) # delete diagonal elements np.fill_diagonal(S, 0) # Sum over the whole matrices s1t = np.sum(T) s1s = np.sum(S) # Drop distances below epsilon and compute sum Deps = dists < self.epsilon nDeps = np.sum(Deps, dtype=int) dists = dists[nDeps:] s2 = np.sum(np.log(dists / r)) # Compute ID, subtracting numbers of dropped measurements ID = -2 * (n_neighbors ** 2 - nTSeps - nDeps - nV0) / (s1t + s1s + 2 * s2) return ID