Source code for skdim.datasets

#
# BSD 3-Clause License
#
# Copyright (c) 2020, Jonathan Bac
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_random_state
from scipy.special import gammainc


[docs]def hyperBall(n, d, radius=1.0, center=[], random_state=None): """ Generates a sample from a uniform distribution on the hyperball Parameters ---------- n: int Number of data points. d: int Dimension of the hyperball radius: float Radius of the hyperball center: list, tuple, np.array Center of the hyperball random_state: int, np.random.RandomState instance Random number generator Returns ------- data: np.array, (npoints x ndim) Generated data """ random_state_ = check_random_state(random_state) if center == []: center = np.array([0] * d) r = radius x = random_state_.normal(size=(n, d)) ssq = np.sum(x ** 2, axis=1) fr = r * gammainc(d / 2, ssq / 2) ** (1 / d) / np.sqrt(ssq) frtiled = np.tile(fr.reshape(n, 1), (1, d)) p = center + np.multiply(x, frtiled) return p
[docs]def hyperSphere(n, d, random_state=None): """ Generates a sample from a uniform distribution on the hypersphere Parameters ---------- n: int Number of data points. d: int Dimension of the hypersphere center: list, tuple, np.array Center of the hypersphere random_state: int, np.random.RandomState instance Random number generator Returns ------- data: np.array, (npoints x ndim) Generated data """ random_state = check_random_state(random_state) vec = random_state.randn(n, d) vec /= np.linalg.norm(vec, axis=1)[:, None] return vec
[docs]def hyperTwinPeaks(n, d=2, height=1.0, random_state=None): """ Generates a sample from a plane with protruding peaks. Translated from Kerstin Johnsson's R package intrinsicDimension Parameters ---------- n: int Number of data points. d: int Dimension of the dataset height: float Height of the peaks random_state: int, np.random.RandomState instance Random number generator Returns ------- data: np.array, (npoints x ndim) Generated data """ random_state = check_random_state(random_state) base_coord = random_state.uniform(size=(n, d)) _height = height * np.prod(np.sin(2 * np.pi * base_coord), axis=1, keepdims=1) return np.hstack((base_coord, _height))
[docs]def lineDiskBall(n, random_state=None): """ Generates a sample from a uniform distribution on a line, an oblong disk and an oblong ball Translated from ldbl function in Hideitsu Hino's package Parameters ---------- n: int Number of data points. random_state: int, np.random.RandomState instance Random number generator Returns ------- data: np.array, (npoints x ndim) Generated data """ random_state = check_random_state(random_state) line = np.hstack( ( np.repeat(0, 5 * n)[:, None], np.repeat(0, 5 * n)[:, None], random_state.uniform(-0.5, 0, size=5 * n)[:, None], ) ) disc = np.hstack( (random_state.uniform(-1, 1, (13 * n, 2)), np.zeros(13 * n)[:, None],) ) disc = disc[~(np.sqrt(np.sum(disc ** 2, axis=1)) > 1), :] disc = disc[:, [0, 2, 1]] disc[:, 2] = disc[:, 2] - min(disc[:, 2]) + max(line[:, 2]) fb = random_state.uniform(-0.5, 0.5, size=(n * 100, 3)) rmID = np.where(np.sqrt(np.sum(fb ** 2, axis=1)) > 0.5)[0] if len(rmID) > 0: fb = fb[~(np.sqrt(np.sum(fb ** 2, axis=1)) > 0.5), :] fb = np.hstack((fb[:, :2], fb[:, [2]] + 0.5)) fb[:, 2] = fb[:, 2] - min(fb[:, 2]) + max(disc[:, 2]) # if _sorted: # fb = fb[order(fb[:, 2]),:] line2 = np.hstack( ( np.repeat(0, 5 * n)[:, None], np.repeat(0, 5 * n)[:, None], random_state.uniform(-0.5, 0, size=5 * n)[:, None], ) ) line2[:, 2] = line2[:, 2] - min(line2[:, 2]) + max(fb[:, 2]) lineID = np.repeat(1, len(line)) discID = np.repeat(2, len(disc)) fbID = np.repeat(3, len(fb)) line2ID = np.repeat(1, len(line2)) x = np.vstack((line, disc, fb, line2)) useID = np.sort(random_state.choice(len(x), n, replace=False)) x = x[useID, :] return x, np.concatenate((lineID, discID, fbID, line2ID), axis=0)[useID]
[docs]def swissRoll3Sph(n_swiss, n_sphere, a=1, b=2, nturn=1.5, h=4, random_state=None): """ Generates a sample from a uniform distribution on a Swiss roll-surface, possibly together with a sample from a uniform distribution on a 3-sphere inside the Swiss roll. Translated from Kerstin Johnsson's R package intrinsicDimension Parameters ---------- n_swiss: int Number of data points on the Swiss roll. n_sphere: int Number of data points on the 3-sphere. a: int or float, default=1 Minimal radius of Swiss roll and radius of 3-sphere. b: int or float, default=2 Maximal radius of Swiss roll. nturn: int or float, default=1.5 Number of turns of the surface. h: int or float, default=4 Height of Swiss roll. Returns ------- data: np.array, (npoints x ndim) Generated data """ random_state = check_random_state(random_state) if n_swiss > 0: omega = 2 * np.pi * nturn def dl(r): return np.sqrt(b ** 2 + omega ** 2 * (a + b * r) ** 2) ok = np.zeros(1) while sum(ok) < n_swiss: r_samp = random_state.uniform(size=3 * n_swiss) ok = random_state.uniform(size=3 * n_swiss) < dl(r_samp) / dl(1) r_samp = r_samp[ok][:n_swiss] x = (a + b * r_samp) * np.cos(omega * r_samp) y = (a + b * r_samp) * np.sin(omega * r_samp) z = random_state.uniform(-h, h, size=n_swiss) w = np.zeros(n_swiss) else: x = y = z = w = np.array([]) if n_sphere > 0: sph = hyperSphere(n_sphere, 4, random_state=random_state) * a x = np.concatenate((x, sph[:, 0])) y = np.concatenate((y, sph[:, 1])) z = np.concatenate((z, sph[:, 2])) w = np.concatenate((w, sph[:, 3])) return np.hstack((x[:, None], y[:, None], z[:, None], w[:, None]))
[docs]class BenchmarkManifolds: """ Generates a commonly used benchmark set of synthetic manifolds with known intrinsic dimension described by Hein et al. and Campadelli et al. [Campadelli2015]_ Parameters ---------- noise_type : str, 'uniform' or 'gaussian' Type of noise to generate """ # class modified and adapted from https://github.com/stat-ml/GeoMLE # Original licence citation: # MIT License # # Copyright (c) 2019 Mokrov Nikita, Marina Gomtsyan, Maxim Panov and Yury Yanovich # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. def __init__(self, random_state: int = None, noise_type: str = "uniform"): self.random_state = check_random_state(random_state) self.noise_type = noise_type self._dict_truth = { "M1_Sphere": (10, 11, "10D sphere linearly embedded"), "M2_Affine_3to5": (3, 5, "Affine space"), "M3_Nonlinear_4to6": ( 4, 6, "Concentrated figure, mistakable with a 3D one", ), "M4_Nonlinear": (4, 8, "Nonlinear manifold"), "M5a_Helix1d": (1, 3, "1D helix"), "M5b_Helix2d": (2, 3, "2D helix"), "M6_Nonlinear": (6, 36, "Nonlinear manifold"), "M7_Roll": (2, 3, "Swiss Roll"), "M8_Nonlinear": (12, 72, "Nonlinear (highly curved) manifold"), "M9_Affine": (20, 20, "Affine space"), "M10a_Cubic": (10, 11, "10D hypercube"), "M10b_Cubic": (17, 18, "17D hypercube"), "M10c_Cubic": (24, 25, "24D hypercube"), "M10d_Cubic": (70, 71, "70D hypercube"), "M11_Moebius": (2, 3, "Möebius band 10-times twisted"), "M12_Norm": (20, 20, "Isotropic multivariate Gaussian"), "M13a_Scurve": (2, 3, "2D S-curve"), "M13b_Spiral": (1, 13, "1D helix curve"), "Mbeta": ( 10, 40, "Manifold generated with a smooth nonuniform pdf (see paper for description)", ), "Mn1_Nonlinear": ( 18, 72, "Nonlinearly embedded manifold of high ID (see paper for description)", ), "Mn2_Nonlinear": ( 24, 96, "Nonlinearly embedded manifold of high ID (see paper for description)", ), "Mp1_Paraboloid": ( 3, 12, "3D paraboloid, nonlinearly embedded in (3(3+1))D space, according to a multivariate Burr distribution (alpha=1)", ), "Mp2_Paraboloid": ( 6, 21, "6D paraboloid, nonlinearly embedded in (3*(6+1))D space, according to a multivariate Burr distribution (alpha=1)", ), "Mp3_Paraboloid": ( 9, 30, "9D paraboloid, nonlinearly embedded in (3*(9+1))D space, according to a multivariate Burr distribution (alpha=1)", ), } self.truth = pd.DataFrame( self._dict_truth, index=["Intrinsic Dimension", "Number of variables", "Description"], ).T self.dict_gen = { # synthetic data "M1_Sphere": self._gen_sphere_data, "M2_Affine_3to5": self._gen_affine3_5_data, "M3_Nonlinear_4to6": self._gen_nonlinear4_6_data, "M4_Nonlinear": self._gen_nonlinear_data, "M5a_Helix1d": self._gen_helix1_data, "M5b_Helix2d": self._gen_helix2_data, "M6_Nonlinear": self._gen_nonlinear_data, "M7_Roll": self._gen_roll_data, "M8_Nonlinear": self._gen_nonlinear_data, "M9_Affine": self._gen_affine_data, "M10a_Cubic": self._gen_cubic_data, "M10b_Cubic": self._gen_cubic_data, "M10c_Cubic": self._gen_cubic_data, "M10d_Cubic": self._gen_cubic_data, "M11_Moebius": self._gen_moebius_data, "M12_Norm": self._gen_norm_data, "M13a_Scurve": self._gen_scurve_data, "M13b_Spiral": self._gen_spiral_data, "Mbeta": self._gen_campadelli_beta_data, "Mn1_Nonlinear": self._gen_campadelli_n_data, "Mn2_Nonlinear": self._gen_campadelli_n_data, "Mp1_Paraboloid": self._gen_paraboloid_data, "Mp2_Paraboloid": self._gen_paraboloid_data, "Mp3_Paraboloid": self._gen_paraboloid_data, }
[docs] def generate( self, name: str = "all", n: int = 2500, dim: int = None, d: int = None, noise: float = 0.0, ): """ Generates all datasets. A ground truth dict of intrinsic dimension and embedding dimension is in BenchmarkManifolds.dict_truth.keys() Parameters ---------- n: int The number of sample points dim: int If generating a single dataset, choose the embedding dimension. Note that some datasets have restrictions on the chosen embedding dimension d: int If generating a single dataset, choose the intrinsic dimension. Note that some datasets have restrictions on the chosen intrinsic dimension noise: float, optional(default=0.0) The value of noise in data Returns ------- data: a dict of np.arrays or a single np.array with shape (n, dim) Generated data """ if self.noise_type == "normal": self.gen_noise = ( lambda n, dim, noise: (self.random_state.randn(n, dim)) * noise ) if self.noise_type == "uniform": self.gen_noise = ( lambda n, dim, noise: (self.random_state.rand(n, dim) - 0.5) * noise ) dict_data = {} if name == "all": for k, (d, dim, _) in self._dict_truth.items(): data = self.dict_gen[k](n=n, dim=dim, d=d) dict_data[k] = data + self.gen_noise(n, dim, noise) return dict_data elif name in self._dict_truth.keys(): if dim is None and d is None: data = self.dict_gen[name](n=n) elif dim is None: data = self.dict_gen[name](n=n, d=d) elif d is None: data = self.dict_gen[name](n=n, dim=dim) data = self.dict_gen[name](n=n, dim=dim, d=d) return data + self.gen_noise(n, dim, noise)
def _gen_spiral_data(self, n, dim=3, d=1): assert d < dim assert d == 1 assert dim >= 3 t = 10 * np.pi * self.random_state.rand(n) data = np.vstack( [100 * np.cos(t), 100 * np.sin(t), t, np.zeros((dim - 3, n))] ).T assert data.shape == (n, dim) return data def _gen_helix1_data(self, n, dim=3, d=1): assert d < dim assert d == 1 assert dim >= 3 t = 2 * np.pi / n + self.random_state.rand(n) * 2 * np.pi data = np.vstack( [ (2 + np.cos(8 * t)) * np.cos(t), (2 + np.cos(8 * t)) * np.sin(t), np.sin(8 * t), np.zeros((dim - 3, n)), ] ).T assert data.shape == (n, dim) return data def _gen_helix2_data(self, n, dim=3, d=2): assert d < dim assert d == 2 assert dim >= 3 r = 10 * np.pi * self.random_state.rand(n) p = 10 * np.pi * self.random_state.rand(n) data = np.vstack( [r * np.cos(p), r * np.sin(p), 0.5 * p, np.zeros((dim - 3, n))] ).T assert data.shape == (n, dim) return data def _gen_helicoid_data(self, n, dim=3, d=2): assert d <= dim assert d == 2 assert dim >= 3 u = 2 * np.pi / n + self.random_state.rand(n) * 2 * np.pi v = 5 * np.pi * self.random_state.rand(n) data = np.vstack( [np.cos(v), np.sin(v) * np.cos(v), u, np.zeros((dim - 3, n))] ).T assert data.shape == (n, dim) return data def _gen_roll_data(self, n, dim=3, d=2): assert d < dim assert dim >= 3 assert d == 2 t = 1.5 * np.pi * (1 + 2 * self.random_state.rand(n)) p = 21 * self.random_state.rand(n) data = np.vstack( [t * np.cos(t), p, t * np.sin(t), np.zeros((dim - d - 1, n))] ).T assert data.shape == (n, dim) return data def _gen_scurve_data(self, n, dim=3, d=2): assert d < dim assert dim >= 3 assert d == 2 t = 3 * np.pi * (self.random_state.rand(n) - 0.5) p = 2.0 * self.random_state.rand(n) data = np.vstack( [np.sin(t), p, np.sign(t) * (np.cos(t) - 1), np.zeros((dim - d - 1, n)),] ).T assert data.shape == (n, dim) return data def _gen_sphere_data(self, n, dim, d): assert d < dim V = self.random_state.randn(n, d + 1) data = np.hstack( [V / np.sqrt((V ** 2).sum(axis=1))[:, None], np.zeros((n, dim - d - 1)),] ) assert data.shape == (n, dim) return data def _gen_norm_data(self, n, dim, d): assert d <= dim norm_xyz = self.random_state.multivariate_normal(np.zeros(d), np.identity(d), n) data = np.hstack([norm_xyz, np.zeros((n, dim - d))]) assert data.shape == (n, dim) return data def _gen_uniform_data(self, n, dim, d): assert d <= dim uniform_xyz = self.random_state.uniform(size=(n, d)) data = np.hstack([uniform_xyz, np.zeros((n, dim - d))]) assert data.shape == (n, dim) return data def _gen_cubic_data(self, n, dim, d): assert d < dim cubic_data = np.array([[]] * (d + 1)) for i in range(d + 1): n_once = int(n / (2 * (d + 1)) + 1) # 1st side data_once = self.random_state.rand(d + 1, n_once) data_once[i] = 0 cubic_data = np.hstack([cubic_data, data_once]) # 2nd side data_once = self.random_state.rand(d + 1, n_once) data_once[i] = 1 cubic_data = np.hstack([cubic_data, data_once]) cubic_data = cubic_data.T[:n] data = np.hstack([cubic_data, np.zeros((n, dim - d - 1))]) assert data.shape == (n, dim) return data def _gen_moebius_data(self, n, dim=3, d=2): assert dim == 3 assert d == 2 phi = self.random_state.rand(n) * 2 * np.pi rad = self.random_state.rand(n) * 2 - 1 data = np.vstack( [ (1 + 0.5 * rad * np.cos(5.0 * phi)) * np.cos(phi), (1 + 0.5 * rad * np.cos(5.0 * phi)) * np.sin(phi), 0.5 * rad * np.sin(5.0 * phi), ] ).T assert data.shape == (n, dim) return data def _gen_affine_data(self, n, dim, d): assert dim >= d p = self.random_state.rand(d, n) * 5 - 2.5 v = np.eye(dim, d) # v = np.random.randint(0, 10, (dim, d)) data = v.dot(p).T assert data.shape == (n, dim) return data def _gen_affine3_5_data(self, n, dim=5, d=3): assert dim == 5 assert d == 3 p = 4 * self.random_state.rand(d, n) A = np.array( [ [1.2, -0.5, 0], [0.5, 0.9, 0], [-0.5, -0.2, 1], [0.4, -0.9, -0.1], [1.1, -0.3, 0], ] ) b = np.array([[3, -1, 0, 0, 8]]).T data = A.dot(p) + b data = data.T assert data.shape == (n, dim) return data def _gen_nonlinear4_6_data(self, n, dim=6, d=4): assert dim == 6 assert d == 4 p0, p1, p2, p3 = self.random_state.rand(d, n) data = np.vstack( [ p1 ** 2 * np.cos(2 * np.pi * p0), p2 ** 2 * np.sin(2 * np.pi * p0), p1 + p2 + (p1 - p3) ** 2, p1 - 2 * p2 + (p0 - p3) ** 2, -p1 - 2 * p2 + (p2 - p3) ** 2, p0 ** 2 - p1 ** 2 + p2 ** 2 - p3 ** 2, ] ).T assert data.shape == (n, dim) return data def _gen_nonlinear_data(self, n, dim, d): assert dim >= d m = int(dim / (2 * d)) assert dim == 2 * m * d p = self.random_state.rand(d, n) F = np.zeros((2 * d, n)) F[0::2, :] = np.cos(2 * np.pi * p) F[1::2, :] = np.sin(2 * np.pi * p) R = np.zeros((2 * d, n)) R[0::2, :] = np.vstack([p[1:], p[0]]) R[1::2, :] = np.vstack([p[1:], p[0]]) D = (R * F).T data = np.hstack([D] * m) assert data.shape == (n, dim) return data def _gen_paraboloid_data(self, n, dim, d): assert dim == 3 * (d + 1) E = self.random_state.exponential(1, (d + 1, n)) X = ((1 + E[1:] / E[0]) ** -1).T X = np.hstack([X, (X ** 2).sum(axis=1)[:, np.newaxis]]) data = np.hstack([X, np.sin(X), X ** 2]) assert data.shape == (n, dim) return data def _gen_campadelli_n_data(self, n, dim, d): assert dim == d * 4 # Generate points drawn from a uniform distribution X = self.random_state.random(size=(n, d)) temp1 = np.zeros((n, d)) temp2 = np.zeros((n, d)) # Extend the embeddingDity: for i in range(d): temp1[:, i] = np.tan(X[:, i] * np.cos(X[:, d - 1 - i])) temp2[:, i] = np.arctan(X[:, d - 1 - i] * np.sin(X[:, i])) # Create the final dataset: data = np.concatenate([temp1, temp2, temp1, temp2], axis=1) return data def _gen_campadelli_beta_data(self, n, dim, d, alpha=10, beta=0.5): assert dim == d * 4 # Function to generate point drawn from a beta distribution X = self.random_state.beta(a=alpha, b=beta, size=((n, d))) temp1 = np.zeros((n, d)) temp2 = np.zeros((n, d)) # Extend the embeddingDity: for i in range(d): temp1[:, i] = X[:, i] * np.sin(np.cos(2 * np.pi * (X[:, i]))) temp2[:, i] = X[:, i] * np.cos(np.sin(2 * np.pi * (X[:, i]))) # Create the final dataset: data = np.concatenate([temp1, temp2, temp1, temp2], axis=1) return data