diff --git a/DBCV/DBCV.py b/DBCV/DBCV.py
deleted file mode 100644
index 5974b1e..0000000
--- a/DBCV/DBCV.py
+++ /dev/null
@@ -1,259 +0,0 @@
-"""
-Implimentation of Density-Based Clustering Validation "DBCV"
-
-Citation:
-Moulavi, Davoud, et al. "Density-based clustering validation."
-Proceedings of the 2014 SIAM International Conference on Data Mining.
-Society for Industrial and Applied Mathematics, 2014.
-"""
-
-import numpy as np
-from scipy.spatial.distance import euclidean, cdist
-from scipy.sparse.csgraph import minimum_spanning_tree
-from scipy.sparse import csgraph
-
-
-def DBCV(X, labels, dist_function=euclidean):
-    """
-    Density Based clustering validation
-
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-
-    Returns: cluster_validity (float)
-        score in range[-1, 1] indicating validity of clustering assignments
-    """
-    graph = _mutual_reach_dist_graph(X, labels, dist_function)
-    mst = _mutual_reach_dist_MST(graph)
-    cluster_validity = _clustering_validity_index(mst, labels)
-    return cluster_validity
-
-
-def _core_dist(point, neighbors, dist_function):
-    """
-    Computes the core distance of a point.
-    Core distance is the inverse density of an object.
-
-    Args:
-        point (np.array): array of dimensions (n_features,)
-            point to compute core distance of
-        neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
-            array of all other points in object class
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-
-    Returns: core_dist (float)
-        inverse density of point
-    """
-    n_features = np.shape(point)[0]
-    n_neighbors = np.shape(neighbors)[0]
-
-    distance_vector = cdist(point.reshape(1, -1), neighbors)
-    distance_vector = distance_vector[distance_vector != 0]
-    numerator = ((1/distance_vector)**n_features).sum()
-    core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features)
-    return core_dist
-
-
-def _mutual_reachability_dist(point_i, point_j, neighbors_i,
-                              neighbors_j, dist_function):
-    """.
-    Computes the mutual reachability distance between points
-
-    Args:
-        point_i (np.array): array of dimensions (n_features,)
-            point i to compare to point j
-        point_j (np.array): array of dimensions (n_features,)
-            point i to compare to point i
-        neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
-            array of all other points in object class of point i
-        neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
-            array of all other points in object class of point j
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-
-    Returns: mutual_reachability (float)
-        mutual reachability between points i and j
-
-    """
-    core_dist_i = _core_dist(point_i, neighbors_i, dist_function)
-    core_dist_j = _core_dist(point_j, neighbors_j, dist_function)
-    dist = dist_function(point_i, point_j)
-    mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
-    return mutual_reachability
-
-
-def _mutual_reach_dist_graph(X, labels, dist_function):
-    """
-    Computes the mutual reach distance complete graph.
-    Graph of all pair-wise mutual reachability distances between points
-
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-
-    Returns: graph (np.ndarray)
-        array of dimensions (n_samples, n_samples)
-        Graph of all pair-wise mutual reachability distances between points.
-
-    """
-    n_samples = np.shape(X)[0]
-    graph = []
-    counter = 0
-    for row in range(n_samples):
-        graph_row = []
-        for col in range(n_samples):
-            point_i = X[row]
-            point_j = X[col]
-            class_i = labels[row]
-            class_j = labels[col]
-            members_i = _get_label_members(X, labels, class_i)
-            members_j = _get_label_members(X, labels, class_j)
-            dist = _mutual_reachability_dist(point_i, point_j,
-                                             members_i, members_j,
-                                             dist_function)
-            graph_row.append(dist)
-        counter += 1
-        graph.append(graph_row)
-    graph = np.array(graph)
-    return graph
-
-
-def _mutual_reach_dist_MST(dist_tree):
-    """
-    Computes minimum spanning tree of the mutual reach distance complete graph
-
-    Args:
-        dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
-            Graph of all pair-wise mutual reachability distances
-            between points.
-
-    Returns: minimum_spanning_tree (np.ndarray)
-        array of dimensions (n_samples, n_samples)
-        minimum spanning tree of all pair-wise mutual reachability
-            distances between points.
-    """
-    mst = minimum_spanning_tree(dist_tree).toarray()
-    return mst + np.transpose(mst)
-
-
-def _cluster_density_sparseness(MST, labels, cluster):
-    """
-    Computes the cluster density sparseness, the minimum density
-        within a cluster
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-
-    Returns: cluster_density_sparseness (float)
-        value corresponding to the minimum density within a cluster
-    """
-    indices = np.where(labels == cluster)[0]
-    cluster_MST = MST[indices][:, indices]
-    cluster_density_sparseness = np.max(cluster_MST)
-    return cluster_density_sparseness
-
-
-def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
-    """
-    Computes the density separation between two clusters, the maximum
-        density between clusters.
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster_i (int): cluster i of interest
-        cluster_j (int): cluster j of interest
-
-    Returns: density_separation (float):
-        value corresponding to the maximum density between clusters
-    """
-    indices_i = np.where(labels == cluster_i)[0]
-    indices_j = np.where(labels == cluster_j)[0]
-    shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
-    relevant_paths = shortest_paths[:, indices_j]
-    density_separation = np.min(relevant_paths)
-    return density_separation
-
-
-def _cluster_validity_index(MST, labels, cluster):
-    """
-    Computes the validity of a cluster (validity of assignmnets)
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-
-    Returns: cluster_validity (float)
-        value corresponding to the validity of cluster assignments
-    """
-    min_density_separation = np.inf
-    for cluster_j in np.unique(labels):
-        if cluster_j != cluster:
-            cluster_density_separation = _cluster_density_separation(MST,
-                                                                     labels,
-                                                                     cluster,
-                                                                     cluster_j)
-            if cluster_density_separation < min_density_separation:
-                min_density_separation = cluster_density_separation
-    cluster_density_sparseness = _cluster_density_sparseness(MST,
-                                                             labels,
-                                                             cluster)
-    numerator = min_density_separation - cluster_density_sparseness
-    denominator = np.max([min_density_separation, cluster_density_sparseness])
-    cluster_validity = numerator / denominator
-    return cluster_validity
-
-
-def _clustering_validity_index(MST, labels):
-    """
-    Computes the validity of all clustering assignments for a
-    clustering algorithm
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-
-    Returns: validity_index (float):
-        score in range[-1, 1] indicating validity of clustering assignments
-    """
-    n_samples = len(labels)
-    validity_index = 0
-    for label in np.unique(labels):
-        fraction = np.sum(labels == label) / float(n_samples)
-        cluster_validity = _cluster_validity_index(MST, labels, label)
-        validity_index += fraction * cluster_validity
-    return validity_index
-
-
-def _get_label_members(X, labels, cluster):
-    """
-    Helper function to get samples of a specified cluster.
-
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-
-    Returns: members (np.ndarray)
-        array of dimensions (n_samples, n_features) of samples of the
-        specified cluster.
-    """
-    indices = np.where(labels == cluster)[0]
-    members = X[indices]
-    return members
diff --git a/DBCV/__init__.py b/DBCV/__init__.py
deleted file mode 100644
index 041ffc9..0000000
--- a/DBCV/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .DBCV import DBCV
\ No newline at end of file
diff --git a/README.md b/README.md
index 262cc21..aaf4e7a 100644
--- a/README.md
+++ b/README.md
@@ -67,9 +67,10 @@ That's pretty good. To assess the quality of clustering, using Density-Based Cl
 
 ```python
 from scipy.spatial.distance import euclidean
+from dbcv import get_score
 
-kmeans_score = DBCV(X, kmeans_labels, dist_function=euclidean)
-hdbscan_score = DBCV(X, hdbscan_labels, dist_function=euclidean)
+kmeans_score = get_score(X, kmeans_labels, dist_function='euclidean')
+hdbscan_score = get_score(X, hdbscan_labels, dist_function='euclidean')
 print(kmeans_score, hdbscan_score)
 ```
 
diff --git a/dbcv.py b/dbcv.py
new file mode 100644
index 0000000..1727b58
--- /dev/null
+++ b/dbcv.py
@@ -0,0 +1,263 @@
+"""
+Implementation of Density-Based Clustering Validation "DBCV"
+"""
+
+import numpy as np
+from scipy.spatial.distance import cdist
+from scipy.sparse.csgraph import minimum_spanning_tree
+from scipy.sparse import csgraph
+from tqdm import tqdm
+
+
+class DBCV:
+    def __init__(self, samples: np.ndarray, labels: np.ndarray, dist_function: str = 'euclidean', verbose=False):
+        """
+        Density Based clustering validation
+
+        Args:
+            samples (np.ndarray): ndarray with dimensions [n_samples, n_features]
+                data to check validity of clustering
+            labels (np.array): clustering assignments for samples
+            dist_function (str): name of the distance metric, passed as the
+                `metric` argument of scipy.spatial.distance.cdist
+            verbose (bool): print progress messages and show a progress bar
+        """
+        self.samples = np.asarray(samples)
+        self.labels = np.asarray(labels)
+        self.dist_function = dist_function
+        self.cluster_lookup = {}
+        self.shortest_paths = None
+        self.verbose = verbose
+
+    def verbose_log(self, msg):
+        if self.verbose:
+            print(msg)
+
+    def get_score(self):
+        """
+        Density Based clustering validation
+
+        Returns: cluster_validity (float)
+            score in range[-1, 1] indicating validity of clustering assignments
+        """
+        graph = self._mutual_reach_dist_graph(self.samples, self.labels, self.dist_function)
+        self.verbose_log("made graph matrix")
+        mst = self._mutual_reach_dist_MST(graph)
+        self.verbose_log("built MST")
+        self.shortest_paths = csgraph.dijkstra(mst)
+        self.verbose_log("calculated shortest paths")
+        cluster_validity = self._clustering_validity_index(mst, self.labels)
+        self.verbose_log("scores calculated")
+        return cluster_validity
+
+    def _core_dist(self, point: np.ndarray, distance_vector: np.ndarray):
+        """
+        Computes the core distance of a point.
+        Core distance is the inverse density of an object.
+
+        Args:
+            point (np.array): array of dimensions (n_features,)
+                point to compute core distance of
+
+            distance_vector (np.array):
+                vector of distances from point to all other points in its cluster
+
+        Returns: core_dist (float)
+            inverse density of point
+        """
+        n_features = np.shape(point)[0]
+        n_neighbors = np.shape(distance_vector)[0]
+
+        distance_vector = distance_vector[distance_vector != 0]
+        numerator = ((1 / distance_vector) ** n_features).sum()
+        core_dist = (numerator / (n_neighbors - 1)) ** (-1 / n_features)
+        return core_dist
+
+    def _calculate_pairwise_distance(self, samples: np.ndarray, dist_function: str):
+        # TODO: align the metric with distance function
+        return cdist(samples, samples, metric=dist_function)
+
+    def _mutual_reach_dist_graph(self, X, labels, dist_function):
+        """
+        Computes the mutual reach distance complete graph.
+        Graph of all pair-wise mutual reachability distances between points
+
+        Args:
+            X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+                data to check validity of clustering
+            labels (np.array): clustering assignments for data X
+            dist_function (str): name of the distance metric, passed as the
+                `metric` argument of scipy.spatial.distance.cdist
+
+        Returns: graph (np.ndarray)
+            array of dimensions (n_samples, n_samples)
+            Graph of all pair-wise mutual reachability distances between points.
+
+        """
+        n_samples = np.shape(X)[0]
+
+        pairwise_distance = self._calculate_pairwise_distance(X, dist_function)
+        core_dists = []
+
+        # the progress bar is only shown when verbose output was requested
+        for idx in tqdm(range(n_samples), disable=not self.verbose):
+            class_label = labels[idx]
+            members = self._get_label_member_indices(labels, class_label)
+            distance_vector = pairwise_distance[idx, :][members]
+            core_dists.append(self._core_dist(X[idx], distance_vector))
+
+        core_dists = np.array(core_dists)
+
+        # element [i, j] is max(core_dist[i], core_dist[j], distance(i, j));
+        # broadcasting avoids materialising an (n_samples, n_samples, 3) temporary
+        graph = np.maximum(
+            np.maximum(core_dists[:, np.newaxis], core_dists[np.newaxis, :]),
+            pairwise_distance,
+        )
+
+        return graph
+
+    def _mutual_reach_dist_MST(self, dist_tree):
+        """
+        Computes minimum spanning tree of the mutual reach distance complete graph
+
+        Args:
+            dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
+                Graph of all pair-wise mutual reachability distances
+                between points.
+
+        Returns: minimum_spanning_tree (np.ndarray)
+            array of dimensions (n_samples, n_samples)
+            minimum spanning tree of all pair-wise mutual reachability
+                distances between points.
+        """
+        mst = minimum_spanning_tree(dist_tree).toarray()
+        return mst + np.transpose(mst)
+
+    def _cluster_density_sparseness(self, MST, labels, cluster):
+        """
+        Computes the cluster density sparseness, the minimum density
+            within a cluster
+
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+            cluster (int): cluster of interest
+
+        Returns: cluster_density_sparseness (float)
+            value corresponding to the minimum density within a cluster
+        """
+        indices = np.where(labels == cluster)[0]
+        cluster_MST = MST[indices][:, indices]
+        cluster_density_sparseness = np.max(cluster_MST)
+        return cluster_density_sparseness
+
+    def _cluster_density_separation(self, MST, labels, cluster_i, cluster_j):
+        """
+        Computes the density separation between two clusters, the maximum
+            density between clusters.
+
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+            cluster_i (int): cluster i of interest
+            cluster_j (int): cluster j of interest
+
+        Returns: density_separation (float):
+            value corresponding to the maximum density between clusters
+        """
+        indices_i = np.where(labels == cluster_i)[0]
+        indices_j = np.where(labels == cluster_j)[0]
+
+        relevant_paths = self.shortest_paths[indices_i][:, indices_j]
+        density_separation = np.min(relevant_paths)
+        return density_separation
+
+    def _cluster_validity_index(self, MST, labels, cluster):
+        """
+        Computes the validity of a cluster (validity of assignments)
+
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+            cluster (int): cluster of interest
+
+        Returns: cluster_validity (float)
+            value corresponding to the validity of cluster assignments
+        """
+        min_density_separation = np.inf
+        for cluster_j in np.unique(labels):
+            if cluster_j != cluster:
+                cluster_density_separation = self._cluster_density_separation(MST,
+                                                                              labels,
+                                                                              cluster,
+                                                                              cluster_j)
+                if cluster_density_separation < min_density_separation:
+                    min_density_separation = cluster_density_separation
+        cluster_density_sparseness = self._cluster_density_sparseness(MST,
+                                                                      labels,
+                                                                      cluster)
+        numerator = min_density_separation - cluster_density_sparseness
+        denominator = np.max([min_density_separation, cluster_density_sparseness])
+        cluster_validity = numerator / denominator
+        return cluster_validity
+
+    def _clustering_validity_index(self, MST, labels):
+        """
+        Computes the validity of all clustering assignments for a
+        clustering algorithm
+
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+
+        Returns: validity_index (float):
+            score in range[-1, 1] indicating validity of clustering assignments
+        """
+        n_samples = len(labels)
+        validity_index = 0
+        for label in np.unique(labels):
+            fraction = np.sum(labels == label) / float(n_samples)
+            cluster_validity = self._cluster_validity_index(MST, labels, label)
+            validity_index += fraction * cluster_validity
+        return validity_index
+
+    def _get_label_member_indices(self, labels, cluster):
+        """
+        Helper function to get the indices of the samples of a specified cluster.
+
+        Args:
+            labels (np.array): clustering assignments for data X
+            cluster (int): cluster of interest
+
+        Returns: members (np.ndarray)
+            array of dimensions (n_samples,) of indices of samples of cluster
+        """
+        if cluster in self.cluster_lookup:
+            return self.cluster_lookup[cluster]
+
+        indices = np.where(labels == cluster)[0]
+
+        self.cluster_lookup[cluster] = indices
+        return indices
+
+
+def get_score(samples: np.ndarray, labels: np.ndarray, dist_function: str = 'euclidean', verbose=False):
+    """
+    Convenience wrapper: builds a DBCV scorer and returns its validity score
+
+    Args:
+        samples (np.ndarray): ndarray with dimensions [n_samples, n_features]
+        labels (np.array): clustering assignments for samples
+        dist_function (str): name of the distance metric passed to cdist
+        verbose (bool): print progress messages and show a progress bar
+
+    Returns: cluster_validity (float)
+        score in range[-1, 1] indicating validity of clustering assignments
+    """
+    scorer = DBCV(samples, labels, dist_function, verbose=verbose)
+    return scorer.get_score()
diff --git a/profiling/profiler.py b/profiling/profiler.py
index 5adf81f..d2c5882 100644
--- a/profiling/profiler.py
+++ b/profiling/profiler.py
@@ -1,22 +1,22 @@
-import DBCV
+from dbcv import get_score
 from sklearn import datasets
 from sklearn.cluster import KMeans
-from scipy.spatial.distance import euclidean
 
 
-def generate_data(n_samples=300, noise=0.05):
+def generate_data(n_samples=1000, noise=0.05):
     noisy_moons = datasets.make_moons(n_samples=n_samples, noise=noise)
     X = noisy_moons[0]
     return X
 
 
 def generate_labels(X):
-    kmeans = KMeans(n_clusters=2)
+    kmeans = KMeans(n_clusters=2)
     kmeans_labels = kmeans.fit_predict(X)
     return kmeans_labels
 
+
 if __name__ == '__main__':
     X = generate_data()
     labels = generate_labels(X)
-    score = DBCV.DBCV(X, labels, dist_function=euclidean)
-
+    score = get_score(X, labels, 'euclidean', verbose=True)
+
diff --git a/tests/noisy_data_test.py b/tests/noisy_data_test.py
new file mode 100644
index 0000000..10b6c53
--- /dev/null
+++ b/tests/noisy_data_test.py
@@ -0,0 +1,67 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from collections import Counter
+from dbcv import DBCV
+from sklearn.cluster import DBSCAN
+from sklearn.metrics import silhouette_score
+
+np.random.seed(5)
+
+
+def get_data():
+    noisy_points = np.random.rand(300, 2) - 0.5
+    all_points = [noisy_points]
+
+    n_clusters = 4
+    cluster_size = 100
+
+    for _ in range(n_clusters):
+        cluster_center = np.random.rand(1, 2) - 0.5
+        points_x = np.random.normal(loc=cluster_center[0][0], scale=0.03, size=cluster_size)
+        points_y = np.random.normal(loc=cluster_center[0][1], scale=0.03, size=cluster_size)
+
+        points = np.array(list(zip(points_x, points_y)))
+        all_points.append(points)
+
+    for sop in all_points:
+        plt.scatter(sop[:, 0], sop[:, 1])
+
+    plt.show()
+
+    return all_points
+
+
+def cluster(points):
+
+    samples = np.concatenate(points, axis=0)
+    print(samples.shape)
+
+    plt.figure(figsize=(20, 20))
+
+    for i, eps in enumerate([0.01, 0.05, 0.1, 0.2]):
+        for j, min_samples in enumerate([5, 20, 50, 100]):
+
+            clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(samples)
+            cnt = Counter()
+            cnt.update(clustering.labels_)
+
+            if len(cnt) < 2:
+                # both scores are undefined for fewer than 2 labels
+                score = sil_score = -1
+            else:
+                sil_score = silhouette_score(samples, clustering.labels_)
+                dbcv = DBCV(samples, clustering.labels_, dist_function='euclidean')
+                score = dbcv.get_score()
+
+            print(f"{eps}-{min_samples}:\ndbcv-score={score:.2f}\nsil-score:{sil_score:.2f}")
+
+            ax = plt.subplot(4, 4, i * 4 + j + 1)
+            plt.scatter(samples[:, 0], samples[:, 1], c=clustering.labels_)
+            ax.set_title(f"{eps}-{min_samples}:\ndbcv-score={score:.2f}\nsil-score:{sil_score:.2f}")
+
+    plt.show()
+
+
+if __name__ == '__main__':
+    points = get_data()
+    cluster(points)
\ No newline at end of file
diff --git a/tests/test_dbcb.py b/tests/test_dbcb.py
deleted file mode 100644
index 21ff6ad..0000000
--- a/tests/test_dbcb.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from DBCV import DBCV
-from sklearn import datasets
-import pytest
-from sklearn.cluster import KMeans
-import hdbscan
-from scipy.spatial.distance import euclidean
-import numpy as np
-
-
-@pytest.fixture
-def data():
-    n_samples = 60
-    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05,
-                                      random_state=1)
-    X = noisy_moons[0]
-    return X
-
-
-def test_DBCV(data):
-    kmeans = KMeans(n_clusters=2)
-    kmeans_labels = kmeans.fit_predict(data)
-    hdbscanner = hdbscan.HDBSCAN()
-    hdbscan_labels = hdbscanner.fit_predict(data)
-    kmeans_score = DBCV.DBCV(data, kmeans_labels, dist_function=euclidean)
-    hdbscan_score = DBCV.DBCV(data, hdbscan_labels, dist_function=euclidean)
-    assert hdbscan_score > kmeans_score
-
-
-def test__core_dist(data):
-    target = 0.09325490419185979
-    point = data[0]
-    core_dist = DBCV._core_dist(point, data, euclidean)
-    assert abs(core_dist - target) < 0.001
-
-
-def test__mutual_reachability_dist(data):
-    target = 0.074196034579080888
-    point_1 = data[0]
-    point_2 = data[1]
-    dist = DBCV._mutual_reachability_dist(point_1, point_2, data, data,
-                                          euclidean)
-    assert dist == euclidean(point_1, point_2)
-    point_3 = data[5]
-    point_4 = data[46]
-    dist_2 = DBCV._mutual_reachability_dist(point_3, point_4, data, data,
-                                            euclidean)
-    assert abs(dist_2 - target) < 0.001
-
-
-def test__mutual_reach_dist_graph(data):
-    target = 0.09872567819414102
-    hdbscanner = hdbscan.HDBSCAN()
-    hdbscan_labels = hdbscanner.fit_predict(data)
-    graph = DBCV._mutual_reach_dist_graph(data, hdbscan_labels,
-                                          euclidean)
-    assert graph.shape == (data.shape[0], data.shape[0])
-    assert abs(graph[0][0] - target < 0.001)
-
-
-def test__mutual_reach_dist_MST():
-    test_array = np.array([[0, 1, 0, 0, 0, 0],
-                           [0, 0, 1, 0, 0, 0],
-                           [0, 0, 0, 2, 0, 0],
-                           [0, 0, 0, 0, 1, 0],
-                           [0, 0, 0, 0, 0, 1],
-                           [0, 0, 0, 0, 0, 0]])
-    target_array = np.array([[0, 1, 0, 0, 0, 0],
-                             [1, 0, 1, 0, 0, 0],
-                             [0, 1, 0, 2, 0, 0],
-                             [0, 0, 2, 0, 1, 0],
-                             [0, 0, 0, 1, 0, 1],
-                             [0, 0, 0, 0, 1, 0]])
-    mst = DBCV._mutual_reach_dist_MST(test_array)
-    assert np.array_equal(mst, target_array)
-
-
-def test__cluster_density_sparseness():
-    test_array = np.array([[0, 1, 0, 0, 0, 0],
-                           [1, 0, 1, 0, 0, 0],
-                           [0, 1, 0, 2, 0, 0],
-                           [0, 0, 2, 0, 1, 0],
-                           [0, 0, 0, 1, 0, 1],
-                           [0, 0, 0, 0, 1, 0]])
-    labels = np.array([0, 0, 0, 1, 1, 1])
-    cluster = 1
-    density = DBCV._cluster_density_sparseness(test_array,
-                                               labels, cluster)
-    assert density == 1
-
-
-def test__cluster_density_separation():
-    test_array = np.array([[0, 1, 0, 0, 0, 0],
-                           [1, 0, 1, 0, 0, 0],
-                           [0, 1, 0, 2, 0, 0],
-                           [0, 0, 2, 0, 1, 0],
-                           [0, 0, 0, 1, 0, 1],
-                           [0, 0, 0, 0, 1, 0]])
-    labels = np.array([0, 0, 0, 1, 1, 1])
-    separation = DBCV._cluster_density_separation(test_array,
-                                                  labels, 0, 1)
-    assert separation == 2
-
-
-def test__cluster_validity_index():
-    test_array = np.array([[0, 1, 0, 0, 0, 0],
-                           [1, 0, 1, 0, 0, 0],
-                           [0, 1, 0, 2, 0, 0],
-                           [0, 0, 2, 0, 1, 0],
-                           [0, 0, 0, 1, 0, 1],
-                           [0, 0, 0, 0, 1, 0]])
-    labels = np.array([0, 0, 0, 1, 1, 1])
-    validity = DBCV._cluster_validity_index(test_array,
-                                            labels, 0)
-    assert validity == 0.5
-
-
-def test__clustering_validity_index():
-    test_array = np.array([[0, 1, 0, 0, 0, 0],
-                           [1, 0, 1, 0, 0, 0],
-                           [0, 1, 0, 2, 0, 0],
-                           [0, 0, 2, 0, 1, 0],
-                           [0, 0, 0, 1, 0, 1],
-                           [0, 0, 0, 0, 1, 0]])
-    labels = np.array([0, 0, 0, 1, 1, 1])
-    validity = DBCV._clustering_validity_index(test_array,
-                                               labels)
-    assert validity == 0.5
-
-
-def test__get_label_members():
-    test_array = np.array([[0, 1, 0, 0, 0, 0],
-                           [1, 0, 1, 0, 0, 0],
-                           [0, 1, 0, 2, 0, 0],
-                           [0, 0, 2, 0, 1, 0],
-                           [0, 0, 0, 1, 0, 1],
-                           [0, 0, 0, 0, 1, 0]])
-    labels = np.array([0, 0, 0, 1, 1, 1])
-    members = DBCV._get_label_members(test_array, labels, 0)
-    target = test_array[np.array([0, 1, 2])]
-    assert np.array_equal(target, members)