Source code for diffprivlib.models.forest

# MIT License
#
# Copyright (C) IBM Corporation 2021
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Random Forest Classifier with Differential Privacy
"""
from collections import defaultdict, namedtuple
import numbers
import warnings
from joblib import Parallel, delayed
import numpy as np

from sklearn.utils import check_array
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.ensemble._forest import ForestClassifier
from sklearn.tree import DecisionTreeClassifier as BaseDecisionTreeClassifier

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.utils import PrivacyLeakWarning
from diffprivlib.mechanisms import PermuteAndFlip
from diffprivlib.validation import DiffprivlibMixin

Dataset = namedtuple('Dataset', ['X', 'y'])


class RandomForestClassifier(ForestClassifier, DiffprivlibMixin):
    r"""Random Forest Classifier with differential privacy.

    This class implements Differentially Private Random Decision Forests using Smooth Sensitivity [1].
    :math:`\epsilon`-Differential privacy is achieved by constructing decision trees via a random splitting criterion
    and applying the Exponential Mechanism to produce a noisy label.

    Parameters
    ----------
    n_estimators : int, default: 10
        The number of trees in the forest.

    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    cat_feature_threshold : int, default: 10
        Threshold value used to determine categorical features.  For example, a value of ``10`` means any feature with
        10 or fewer unique values will be treated as a categorical feature.

    n_jobs : int, default: 1
        Number of CPU cores used when parallelising over trees.  ``-1`` means using all processors.

    verbose : int, default: 0
        Set to any positive number for verbosity.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    max_depth : int, default: 15
        The maximum depth of the tree.  The final depth of the tree is calculated from the number of continuous and
        categorical features, but will not exceed this value.
        Note: the depth translates to an exponential increase in memory usage.

    random_state : float, optional
        Sets the numpy random seed.

    feature_domains : dict, optional
        A dictionary of domain values for all features, where keys are the feature indexes in the training data and
        the values are an array of domain values for categorical features, or an array of min and max values for
        continuous features.  For example, if the training data is [[2, 'dog'], [5, 'cat'], [7, 'dog']], then
        feature_domains would be {'0': [2, 7], '1': ['dog', 'cat']}.  If not provided, feature domains will be
        constructed from the data, but this will result in a :class:`.PrivacyLeakWarning`.

    Attributes
    ----------
    n_features_in_ : int
        The number of features when fit is performed.

    n_classes_ : int
        The number of classes.

    classes_ : array of shape (n_classes,)
        The class labels.

    cat_features_ : array of categorical feature indexes
        Categorical feature indexes.

    max_depth_ : int
        Final max depth used for constructing decision trees.

    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    feature_domains_ : dict
        Dictionary of domain values mapped to feature indexes in the training data.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> from diffprivlib.models import RandomForestClassifier
    >>> X, y = make_classification(n_samples=1000, n_features=4,
    ...                            n_informative=2, n_redundant=0,
    ...                            random_state=0, shuffle=False)
    >>> clf = RandomForestClassifier(n_estimators=100, random_state=0)
    >>> clf.fit(X, y)
    >>> print(clf.predict([[0, 0, 0, 0]]))
    [1]

    References
    ----------
    [1] Sam Fletcher, Md Zahidul Islam.  "Differentially Private Random Decision Forests using Smooth Sensitivity."
        https://arxiv.org/abs/1606.03572

    """

    def __init__(self, n_estimators=10, *, epsilon=1.0, cat_feature_threshold=10, n_jobs=1, verbose=0, accountant=None,
                 max_depth=15, random_state=None, feature_domains=None, **unused_args):
        super().__init__(
            base_estimator=DecisionTreeClassifier(),
            n_estimators=n_estimators,
            estimator_params=("cat_feature_threshold", "max_depth", "epsilon", "random_state"),
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)
        self.epsilon = epsilon
        self.cat_feature_threshold = cat_feature_threshold
        self.max_depth = max_depth
        self.accountant = BudgetAccountant.load_default(accountant)
        self.feature_domains = feature_domains

        if random_state is not None:
            np.random.seed(random_state)

        self._warn_unused_args(unused_args)

    def fit(self, X, y, sample_weight=None):
        """Fit the model to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X.

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : class

        """
        if sample_weight is not None:
            self._warn_unused_args("sample_weight")

        if not isinstance(self.n_estimators, numbers.Integral) or self.n_estimators < 1:
            raise ValueError(f"Number of estimators should be a positive integer; got {self.n_estimators}")

        if not isinstance(self.cat_feature_threshold, numbers.Integral) or self.cat_feature_threshold < 0:
            raise ValueError("Categorical feature threshold should be a positive integer; "
                             f"got {self.cat_feature_threshold}")

        self.accountant.check(self.epsilon, 0)

        X, y = self._validate_data(X, y, multi_output=False)
        self.n_outputs_ = 1
        self.n_features_in_ = X.shape[1]
        self.cat_features_ = get_cat_features(X, self.cat_feature_threshold)
        self.max_depth_ = calc_tree_depth(n_cont_features=self.n_features_in_ - len(self.cat_features_),
                                          n_cat_features=len(self.cat_features_), max_depth=self.max_depth)
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self.feature_domains_ = self.feature_domains

        if self.feature_domains_ is None:
            warnings.warn(
                "`feature_domains` parameter hasn't been specified, so falling back to determining domains from the "
                "data.\n"
                "This may result in additional privacy leakage.  To ensure differential privacy with no additional "
                "privacy loss, specify `feature_domains` according to the documentation",
                PrivacyLeakWarning)
            self.feature_domains_ = get_feature_domains(X, self.cat_features_)

        if len(self.feature_domains_) != self.n_features_in_:
            raise ValueError("Missing domains for some features in `feature_domains`")

        if self.n_estimators > len(X):
            raise ValueError("Number of estimators is more than the available samples")

        # Each tree is trained on a disjoint subset of the data
        subset_size = int(len(X) / self.n_estimators)
        datasets = []
        estimators = []

        for i in range(self.n_estimators):
            estimator = DecisionTreeClassifier(max_depth=self.max_depth_, epsilon=self.epsilon,
                                               feature_domains=self.feature_domains_,
                                               cat_features=self.cat_features_, classes=self.classes_)
            estimators.append(estimator)
            datasets.append(Dataset(X=X[i * subset_size:(i + 1) * subset_size],
                                    y=y[i * subset_size:(i + 1) * subset_size]))

        estimators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='processes'))(
            delayed(lambda estimator, X, y: estimator.fit(X, y))(estimator, dataset.X, dataset.y)
            for estimator, dataset in zip(estimators, datasets)
        )

        self.estimators_ = estimators
        self.accountant.spend(self.epsilon, 0)
        self.fitted_ = True

        return self
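
# A minimal sketch of the ``feature_domains`` format accepted above, assuming a toy dataset whose column 0 is
# continuous and whose column 1 is categorical (the bounds and category values are arbitrary, for illustration only):
#
#     feature_domains = {"0": [0.0, 100.0], "1": ["dog", "cat"]}
#
# Keys are string feature indexes.  Continuous features map to ``[min, max]`` bounds used to draw random split values;
# categorical features map to the list of admissible values, and both the domain values and the data are matched via
# ``str()`` when a sample is routed down the tree.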

class DecisionTreeClassifier(BaseDecisionTreeClassifier, DiffprivlibMixin):
    r"""Decision Tree Classifier with differential privacy.

    This class implements the base differentially private decision tree classifier for the Random Forest classifier
    algorithm.  Not meant to be used separately.

    Parameters
    ----------
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    cat_feature_threshold : int, default: 10
        Threshold value used to determine categorical features.  For example, a value of ``10`` means any feature with
        10 or fewer unique values will be treated as a categorical feature.

    max_depth : int, default: 15
        The maximum depth of the tree.

    random_state : float, optional
        Sets the numpy random seed.

    cat_features : array, optional
        Array of categorical feature indexes.  If not provided, will be determined from the data based on the
        cat_feature_threshold.

    classes : array of shape (n_classes_,), optional
        Array of class labels.  If not provided, will be determined from the data.

    feature_domains : dict, optional
        A dictionary of domain values for all features, where keys are the feature indexes in the training data and
        the values are an array of domain values for categorical features, or an array of min and max values for
        continuous features.  For example, if the training data is [[2, 'dog'], [5, 'cat'], [7, 'dog']], then
        feature_domains would be {'0': [2, 7], '1': ['dog', 'cat']}.  If not provided, feature domains will be
        constructed from the data, but this will result in a :class:`.PrivacyLeakWarning`.

    Attributes
    ----------
    n_features_in_ : int
        The number of features when fit is performed.

    n_classes_ : int
        The number of classes.

    classes_ : array of shape (n_classes,)
        The class labels.

    cat_features_ : array of categorical feature indexes
        Categorical feature indexes.

    feature_domains_ : dict
        Dictionary of domain values mapped to feature indexes in the training data.

    """

    def __init__(self, cat_feature_threshold=10, max_depth=15, epsilon=1, random_state=None, feature_domains=None,
                 cat_features=None, classes=None):
        # TODO: Remove try...except when sklearn v1.0 is min-requirement
        try:
            super().__init__(
                criterion=None,
                splitter=None,
                max_depth=max_depth,
                min_samples_split=None,
                min_samples_leaf=None,
                min_weight_fraction_leaf=None,
                max_features=None,
                random_state=random_state,
                max_leaf_nodes=None,
                min_impurity_decrease=None,
                min_impurity_split=None
            )
        except TypeError:
            super().__init__(
                criterion=None,
                splitter=None,
                max_depth=max_depth,
                min_samples_split=None,
                min_samples_leaf=None,
                min_weight_fraction_leaf=None,
                max_features=None,
                random_state=random_state,
                max_leaf_nodes=None,
                min_impurity_decrease=None
            )
        self.feature_domains = feature_domains
        self.cat_feature_threshold = cat_feature_threshold
        self.epsilon = epsilon
        self.cat_features = cat_features
        self.classes = classes

        if random_state is not None:
            np.random.seed(random_state)

    def _build(self, features, feature_domains, current_depth=1):
        if not features or current_depth >= self.max_depth + 1:
            return DecisionNode(level=current_depth, classes=self.classes_)

        split_feature = np.random.choice(features)
        node = DecisionNode(level=current_depth, classes=self.classes_, split_feature=split_feature)

        if split_feature in self.cat_features_:
            node.set_split_type(DecisionNode.CAT_SPLIT)
            for value in feature_domains[str(split_feature)]:
                child_node = self._build([f for f in features if f != split_feature], feature_domains,
                                         current_depth + 1)
                node.add_cat_child(value, child_node)
        else:
            node.set_split_type(DecisionNode.CONT_SPLIT)
            split_value = np.random.uniform(feature_domains[str(split_feature)][0],
                                            feature_domains[str(split_feature)][1])
            node.set_split_value(split_value)
            left_domain = {k: v if k != str(split_feature) else [v[0], split_value]
                           for k, v in feature_domains.items()}
            right_domain = {k: v if k != str(split_feature) else [split_value, v[1]]
                            for k, v in feature_domains.items()}
            left_child = self._build(features, left_domain, current_depth + 1)
            right_child = self._build(features, right_domain, current_depth + 1)
            node.set_left_child(left_child)
            node.set_right_child(right_child)

        return node

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated"):
        if sample_weight is not None:
            self._warn_unused_args("sample_weight")

        if not isinstance(self.cat_feature_threshold, numbers.Integral) or self.cat_feature_threshold < 0:
            raise ValueError("Categorical feature threshold should be a positive integer; "
                             f"got {self.cat_feature_threshold}")

        if check_input:
            X, y = self._validate_data(X, y, multi_output=False)

        self.n_outputs_ = 1
        self.feature_domains_ = self.feature_domains
        self.cat_features_ = self.cat_features

        if self.cat_features_ is None:
            self.cat_features_ = get_cat_features(X, self.cat_feature_threshold)

        if self.feature_domains_ is None:
            warnings.warn(
                "`feature_domains` parameter hasn't been specified, so falling back to determining domains from the "
                "data.\n"
                "This may result in additional privacy leakage.  To ensure differential privacy with no additional "
                "privacy loss, specify `feature_domains` according to the documentation",
                PrivacyLeakWarning)
            self.feature_domains_ = get_feature_domains(X, self.cat_features_)

        self.classes_ = self.classes

        if self.classes_ is None:
            self.classes_ = np.unique(y)

        self.n_classes_ = len(self.classes_)
        self.n_features_in_ = X.shape[1]

        features = list(range(self.n_features_in_))
        self.tree_ = self._build(features, self.feature_domains_)

        for i, _ in enumerate(X):
            node = self.tree_.classify(X[i])
            node.update_class_count(y[i].item())

        self.tree_.set_noisy_label(self.epsilon, self.classes_)

        return self

    @property
    def n_features_(self):
        return self.n_features_in_

    def _more_tags(self):
        return {}


class DecisionNode:
    """Base Decision Node
    """
    CONT_SPLIT = 0
    CAT_SPLIT = 1

    def __init__(self, level, classes, split_feature=None, split_value=None, split_type=None):
        """Initialize DecisionNode.

        Parameters
        ----------
        level : int
            Node level in the tree.

        classes : list
            List of class labels.

        split_feature : int
            Split feature index.

        split_value : Any
            Feature value to split at.

        split_type : int
            Type of split.

        """
        self._level = level
        self._classes = classes
        self._split_type = split_type
        self._split_feature = split_feature
        self._split_value = split_value
        self._left_child = None
        self._right_child = None
        self._cat_children = {}
        self._class_counts = defaultdict(int)
        self._noisy_label = None

    @property
    def noisy_label(self):
        """Get noisy label"""
        return self._noisy_label

    def set_split_value(self, split_value):
        """Set split value"""
        self._split_value = split_value

    def set_split_type(self, split_type):
        """Set split type"""
        self._split_type = split_type

    def set_left_child(self, node):
        """Set left child of the node"""
        self._left_child = node

    def set_right_child(self, node):
        """Set right child of the node"""
        self._right_child = node

    def add_cat_child(self, cat_value, node):
        """Add a categorical child node"""
        self._cat_children[str(cat_value)] = node

    def is_leaf(self):
        """Check whether the node is a leaf node"""
        return not self._left_child and not self._right_child and not self._cat_children

    def update_class_count(self, class_value):
        """Update the class count for the given class"""
        self._class_counts[class_value] += 1

    def classify(self, x):
        """Classify the given data"""
        if self.is_leaf():
            return self

        child = None

        if self._split_type == self.CAT_SPLIT:
            x_val = str(x[self._split_feature])
            child = self._cat_children.get(x_val)
        else:
            x_val = x[self._split_feature]
            if x_val < self._split_value:
                child = self._left_child
            else:
                child = self._right_child

        if child is None:
            return self

        return child.classify(x)

    def set_noisy_label(self, epsilon, class_values):
        """Set the noisy label for this node"""
        if self.is_leaf():
            if not self._noisy_label:
                for val in class_values:
                    if val not in self._class_counts:
                        self._class_counts[val] = 0
                utility = list(self._class_counts.values())
                candidates = list(self._class_counts.keys())
                mech = PermuteAndFlip(epsilon=epsilon, sensitivity=1, monotonic=True, utility=utility,
                                      candidates=candidates)
                self._noisy_label = mech.randomise()
        else:
            if self._left_child:
                self._left_child.set_noisy_label(epsilon, class_values)
            if self._right_child:
                self._right_child.set_noisy_label(epsilon, class_values)
            for child_node in self._cat_children.values():
                child_node.set_noisy_label(epsilon, class_values)

    def predict(self, X):
        """Predict using this node"""
        y = []
        X = np.array(X)
        check_array(X)

        for x in X:
            node = self.classify(x)
            proba = np.zeros(len(self._classes))
            proba[np.where(self._classes == node.noisy_label)[0].item()] = 1
            y.append(proba)

        return np.array(y)


def get_feature_domains(X, cat_features):
    """Calculate feature domains from the data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and n_features is the number of features.

    cat_features : list of int
        List of categorical feature indexes.

    Returns
    -------
    dict
        Dictionary with keys as feature indexes and values as feature domains.

    """
    feature_domains = {}
    X_t = np.transpose(X)
    cont_features = list(set(range(X.shape[1])) - set(cat_features))

    for i in cat_features:
        feature_domains[str(i)] = [str(x) for x in set(X_t[i])]

    for i in cont_features:
        vals = [float(x) for x in X_t[i]]
        feature_domains[str(i)] = [min(vals), max(vals)]

    return feature_domains


def get_cat_features(X, feature_threshold=2):
    """Determine categorical features.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and n_features is the number of features.

    feature_threshold : int, default: 2
        Threshold value used to determine categorical features.  For example, a value of ``10`` means any feature with
        10 or fewer unique values will be treated as a categorical feature.

    Returns
    -------
    list
        List of categorical feature indexes.

    """
    n_features = X.shape[1]
    cat_features = []

    for i in range(n_features):
        values = set(X[:, i])
        if len(values) <= feature_threshold:
            cat_features.append(i)

    return cat_features


def calc_tree_depth(n_cont_features, n_cat_features, max_depth=15):
    """Calculate the tree depth.

    Parameters
    ----------
    n_cont_features : int
        Number of continuous features.

    n_cat_features : int
        Number of categorical features.

    max_depth : int, default: 15
        Maximum tree depth.

    Returns
    -------
    int
        Final tree depth.

    """
    if n_cont_features < 1:
        return min(max_depth, np.floor(n_cat_features / 2.))

    # Designed using balls-in-bins probability.  See the paper for details.
    m = float(n_cont_features)
    depth = 0
    expected_empty = m  # the number of unique attributes not selected so far

    while expected_empty > m / 2.:  # repeat until less than half the attributes are expected to be empty
        expected_empty = m * ((m - 1.) / m) ** depth
        depth += 1

    # the above was only for half the numerical attributes.  now add half the categorical attributes
    final_depth = np.floor(depth + (n_cat_features / 2.))

    return min(max_depth, final_depth)
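

# A minimal end-to-end usage sketch: training with explicitly specified feature domains so that no
# PrivacyLeakWarning is raised.  The dataset, the per-feature bounds of [-10, 10] and the hyperparameter
# values below are arbitrary assumptions chosen for illustration only.
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    X_demo, y_demo = make_classification(n_samples=1000, n_features=4, n_informative=2, n_redundant=0,
                                         random_state=0, shuffle=False)

    # All four features are continuous here, so each domain is a [min, max] pair.  Using publicly known
    # bounds, rather than bounds computed from the private data, avoids the extra leakage warned about in fit().
    demo_domains = {str(i): [-10.0, 10.0] for i in range(X_demo.shape[1])}

    clf = RandomForestClassifier(n_estimators=10, epsilon=1.0, feature_domains=demo_domains, random_state=0)
    clf.fit(X_demo, y_demo)
    print(clf.predict([[0, 0, 0, 0]]))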