
# MIT License
#
# Copyright (C) IBM Corporation 2019
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
K-means clustering algorithm satisfying differential privacy.
"""
import warnings

import numpy as np
import sklearn.cluster as sk_cluster

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricFolded
from diffprivlib.utils import PrivacyLeakWarning, check_random_state
from diffprivlib.validation import DiffprivlibMixin


class KMeans(sk_cluster.KMeans, DiffprivlibMixin):
    r"""K-Means clustering with differential privacy.

    Implements the DPLloyd approach presented in [SCL16]_, leveraging the :class:`sklearn.cluster.KMeans` class for
    full integration with Scikit Learn.

    Parameters
    ----------
    n_clusters : int, default: 8
        The number of clusters to form as well as the number of centroids to generate.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    bounds : tuple, optional
        Bounds of the data, provided as a tuple of the form (min, max).  `min` and `max` can either be scalars,
        covering the min/max of the entire data, or vectors with one entry per feature.  If not provided, the bounds
        are computed on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`.

    random_state : int or RandomState, optional
        Controls the randomness of the model.  To obtain a deterministic behaviour during randomisation,
        ``random_state`` has to be fixed to an integer.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Attributes
    ----------
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers.  If the algorithm stops before fully converging, these will not be
        consistent with ``labels_``.

    labels_ :
        Labels of each point.

    inertia_ : float
        Sum of squared distances of samples to their closest cluster center.

    n_iter_ : int
        Number of iterations run.

    References
    ----------
    .. [SCL16] Su, Dong, Jianneng Cao, Ninghui Li, Elisa Bertino, and Hongxia Jin. "Differentially private k-means
        clustering." In Proceedings of the Sixth ACM Conference on Data and Application Security and Privacy,
        pp. 26-37. ACM, 2016.

    """
    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
        sk_cluster.KMeans, "n_clusters", "random_state")

    def __init__(self, n_clusters=8, *, epsilon=1.0, bounds=None, random_state=None, accountant=None, **unused_args):
        super().__init__(n_clusters=n_clusters, random_state=random_state)

        self.epsilon = epsilon
        self.bounds = bounds
        self.accountant = BudgetAccountant.load_default(accountant)

        self._warn_unused_args(unused_args)

        self.cluster_centers_ = None
        self.bounds_processed = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = None
        self._n_threads = 1
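
    # A minimal usage sketch (illustrative only; assumes diffprivlib is installed and
    # that ``KMeans`` is exported from ``diffprivlib.models``):
    #
    #     import numpy as np
    #     from diffprivlib.models import KMeans
    #
    #     X = np.array([[1.0, 2.0], [1.0, 4.0], [10.0, 2.0], [10.0, 0.0]])
    #     clf = KMeans(n_clusters=2, epsilon=1.0, bounds=(0.0, 15.0))
    #     clf.fit(X)            # spends (epsilon, 0) of privacy budget
    #     clf.cluster_centers_  # noisy centroids, kept within the given bounds
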
    def fit(self, X, y=None, sample_weight=None):
        """Computes k-means clustering with differential privacy.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : class

        """
        self._validate_params()
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            self._warn_unused_args("sample_weight")

        del y

        random_state = check_random_state(self.random_state)

        X = self._validate_data(X, accept_sparse=False, dtype=[np.float64, np.float32])
        n_samples, n_dims = X.shape

        if n_samples < self.n_clusters:
            raise ValueError(f"n_samples={n_samples} should be >= n_clusters={self.n_clusters}")

        iters = self._calc_iters(n_dims, n_samples)

        if self.bounds is None:
            warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = self._check_bounds(self.bounds, n_dims, min_separation=1e-5)
        X = self._clip_to_bounds(X, self.bounds)

        centers = self._init_centers(n_dims, random_state=random_state)
        labels = None
        distances = None

        # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely
        for _ in range(-1, iters):
            if labels is not None:
                centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters,
                                               random_state=random_state)

            distances, labels = self._distances_labels(X, centers)

        self.cluster_centers_ = centers
        self.labels_ = labels
        self.inertia_ = distances[np.arange(len(labels)), labels].sum()
        self.n_iter_ = iters

        self.accountant.spend(self.epsilon, 0)

        return self
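
    # Sketch of budget accounting across repeated fits (illustrative only; mirrors
    # the ``accountant.check`` / ``accountant.spend`` calls in ``fit`` above):
    #
    #     from diffprivlib.accountant import BudgetAccountant
    #
    #     acc = BudgetAccountant(epsilon=2.0)  # total budget available
    #     clf = KMeans(n_clusters=2, epsilon=1.0, bounds=(0.0, 1.0), accountant=acc)
    #     clf.fit(X_a)  # spends (1.0, 0); 1.0 remains
    #     clf.fit(X_b)  # spends (1.0, 0); budget exhausted
    #     clf.fit(X_c)  # raises, since the check fails before any noise is drawn
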
    def _init_centers(self, dims, random_state):
        if self.bounds_processed is None:
            bounds_processed = np.zeros(shape=(dims, 2))

            for dim in range(dims):
                lower = self.bounds[0][dim]
                upper = self.bounds[1][dim]

                bounds_processed[dim, :] = [upper - lower, lower]

            self.bounds_processed = bounds_processed

        cluster_proximity = np.min(self.bounds_processed[:, 0]) / 2.0

        while cluster_proximity > 0:
            centers = np.zeros(shape=(self.n_clusters, dims))
            cluster, retry = 0, 0

            while retry < 100:
                if cluster >= self.n_clusters:
                    break

                temp_center = random_state.random(dims) * (self.bounds_processed[:, 0] - 2 * cluster_proximity) + \
                    self.bounds_processed[:, 1] + cluster_proximity

                if cluster == 0:
                    centers[0, :] = temp_center
                    cluster += 1
                    continue

                min_distance = ((centers[:cluster, :] - temp_center) ** 2).sum(axis=1).min()

                if np.sqrt(min_distance) >= 2 * cluster_proximity:
                    centers[cluster, :] = temp_center
                    cluster += 1
                    retry = 0
                else:
                    retry += 1

            if cluster >= self.n_clusters:
                return centers

            cluster_proximity /= 2.0

        return None

    def _distances_labels(self, X, centers):
        distances = np.zeros((X.shape[0], self.n_clusters))

        for cluster in range(self.n_clusters):
            distances[:, cluster] = ((X - centers[cluster, :]) ** 2).sum(axis=1)

        labels = np.argmin(distances, axis=1)
        return distances, labels

    def _update_centers(self, X, centers, labels, dims, total_iters, random_state):
        """Updates the centers of the KMeans algorithm for the current iteration, while satisfying differential
        privacy.

        Differential privacy is satisfied by adding (integer-valued, using :class:`.GeometricFolded`) random noise to
        the count of nearest neighbours to the previous cluster centers, and adding (real-valued, using
        :class:`.LaplaceBoundedDomain`) random noise to the sum of values per dimension.
        """
        epsilon_0, epsilon_i = self._split_epsilon(dims, total_iters)
        geometric_mech = GeometricFolded(epsilon=epsilon_0, sensitivity=1, lower=0.5, upper=float("inf"),
                                         random_state=random_state)

        for cluster in range(self.n_clusters):
            if cluster not in labels:
                continue

            cluster_count = sum(labels == cluster)
            noisy_count = geometric_mech.randomise(cluster_count)

            cluster_sum = np.sum(X[labels == cluster], axis=0)
            noisy_sum = np.zeros_like(cluster_sum)

            for i in range(dims):
                laplace_mech = LaplaceBoundedDomain(epsilon=epsilon_i,
                                                    sensitivity=self.bounds[1][i] - self.bounds[0][i],
                                                    lower=noisy_count * self.bounds[0][i],
                                                    upper=noisy_count * self.bounds[1][i],
                                                    random_state=random_state)
                noisy_sum[i] = laplace_mech.randomise(cluster_sum[i])

            centers[cluster, :] = noisy_sum / noisy_count

        return centers

    def _split_epsilon(self, dims, total_iters, rho=0.225):
        """Split epsilon between sum perturbation and count perturbation, as proposed by Su et al.

        Parameters
        ----------
        dims : int
            Number of dimensions to split `epsilon` across.

        total_iters : int
            Total number of iterations to split `epsilon` across.

        rho : float, default: 0.225
            Coordinate normalisation factor.

        Returns
        -------
        epsilon_0 : float
            The epsilon value for satisfying differential privacy on the count of a cluster.

        epsilon_i : float
            The epsilon value for satisfying differential privacy on each dimension of the center of a cluster.

        """
        epsilon_i = 1
        epsilon_0 = np.cbrt(4 * dims * rho ** 2)

        normaliser = self.epsilon / total_iters / (epsilon_i * dims + epsilon_0)

        return epsilon_i * normaliser, epsilon_0 * normaliser

    def _calc_iters(self, n_dims, n_samples, rho=0.225):
        """Calculate the number of iterations to allow for the KMeans algorithm."""
        epsilon_m = np.sqrt(500 * (self.n_clusters ** 3) / (n_samples ** 2) *
                            (n_dims + np.cbrt(4 * n_dims * (rho ** 2))) ** 3)

        iters = max(min(self.epsilon / epsilon_m, 7), 2)

        return int(iters)