1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
class DecisionTreeClassifierWithMissing(DecisionTreeClassifier):
    """Decision tree classifier whose traversal tolerates NaN feature values.

    A sample whose value for a node's split feature is NaN is treated as
    belonging to both branches: ``_split_node`` sends it to both children,
    and at prediction time both subtrees are evaluated and their answers are
    combined by a vote.
    """

    def _split_node(self, X, y, sample_weight, depth, impurity,
                    n_node_samples, weighted_n_node_samples, feature,
                    threshold):
        """Partition the samples on ``feature``/``threshold`` and recurse.

        Rows with ``X[:, feature] <= threshold`` go left, the remaining
        non-missing rows go right, and rows whose value is NaN are sent to
        both sides.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: Targets, indexable by an integer index array.
            sample_weight: Per-sample weights indexable by an integer index
                array, or None to give every sample weight 1.
            depth: Depth of the node being split; children get ``depth + 1``.
            impurity, n_node_samples, weighted_n_node_samples: Forwarded
                unchanged to the base implementation for both children.
            feature: Column index of the split feature.
            threshold: Split threshold.

        Returns:
            Tuple ``(left_child, right_child)`` of whatever the base class's
            ``_split_node`` returns for each side.
        """
        column = X[:, feature]
        missing = np.isnan(column)
        # NaN compares False against any threshold, so missing rows fall out
        # of ``goes_left`` and must be added to the left side explicitly;
        # they are already part of its complement on the right side.
        with np.errstate(invalid="ignore"):
            goes_left = column <= threshold

        # flatnonzero always yields integer indices, so an empty side indexes
        # cleanly (np.array([]) would be float64 and raise IndexError).
        left_indices = np.flatnonzero(goes_left | missing)
        right_indices = np.flatnonzero(~goes_left)

        if sample_weight is None:
            left_sample_weight = np.ones(left_indices.shape[0])
            right_sample_weight = np.ones(right_indices.shape[0])
        else:
            left_sample_weight = sample_weight[left_indices]
            right_sample_weight = sample_weight[right_indices]

        # NOTE(review): scikit-learn's documented DecisionTreeClassifier API
        # has no ``_split_node`` hook -- confirm the base class in use
        # provides one, otherwise this override is never invoked by ``fit``
        # and the ``super()`` calls below raise AttributeError.
        # NOTE(review): the parent's impurity / sample counts are passed to
        # both children as-is; verify that is what the base method expects.
        left_child = super()._split_node(
            X[left_indices], y[left_indices], left_sample_weight, depth + 1,
            impurity, n_node_samples, weighted_n_node_samples, feature,
            threshold)
        right_child = super()._split_node(
            X[right_indices], y[right_indices], right_sample_weight,
            depth + 1, impurity, n_node_samples, weighted_n_node_samples,
            feature, threshold)
        return left_child, right_child

    def predict(self, X, check_input=True):
        """Predict a class label for every row of ``X``, allowing NaN values.

        Args:
            X: 2-D array-like of samples (rows) by features (columns).
            check_input: Accepted only for call compatibility with the base
                class's ``predict``; unused here, because NaN values must
                reach the traversal instead of being rejected.

        Returns:
            1-D array with one prediction per row. Predictions are labels
            taken from ``self.classes_``; if the instance has no
            ``classes_`` attribute the raw class indices are returned.
        """
        X = np.asarray(X)
        # dtype is pinned so that an empty X still yields a valid index array.
        class_indices = np.array(
            [self._predict_from_node(row, 0) for row in X], dtype=np.intp)

        classes = getattr(self, "classes_", None)
        if classes is None:
            return class_indices
        if isinstance(classes, list):
            # A list of per-output label arrays: only the first output is
            # predicted, since the traversal reads ``tree_.value[node][0]``.
            classes = classes[0]
        # argmax over the value row is a position into classes_, not a label.
        return np.asarray(classes)[class_indices]

    def _predict_from_node(self, x, node_id):
        """Return the predicted class index for sample ``x`` from ``node_id``.

        Descends the fitted tree from ``node_id``. When the sample's value
        for a node's split feature is NaN, both children are evaluated
        recursively and the most frequent answer wins; when the two answers
        differ, the smaller class index is returned (``np.unique`` sorts and
        ``np.argmax`` takes the first maximum).

        Args:
            x: 1-D array holding the feature values of one sample.
            node_id: Index of the node to start from (0 is where ``predict``
                starts).

        Returns:
            Integer position of the winning class in the node value row
            (``tree_.value[node][0]``).
        """
        tree = self.tree_
        # Read the tree arrays once instead of on every loop iteration.
        children_left = tree.children_left
        children_right = tree.children_right
        features = tree.feature
        thresholds = tree.threshold

        # A node whose two child ids are equal is treated as a leaf.
        while children_left[node_id] != children_right[node_id]:
            left_id = children_left[node_id]
            right_id = children_right[node_id]
            value = x[features[node_id]]
            if np.isnan(value):
                votes = [
                    self._predict_from_node(x, left_id),
                    self._predict_from_node(x, right_id),
                ]
                candidates, counts = np.unique(votes, return_counts=True)
                return candidates[np.argmax(counts)]
            node_id = left_id if value <= thresholds[node_id] else right_id

        return np.argmax(tree.value[node_id][0])
|