决策树之CART算法

CART算法是二叉决策树。本来这个算法难到我想放弃,copy某位博主的代码修改修改就交上去的。结果越改越多,最后除了核心算法没改,整个框架结构都被我改了。被我修改得看起来貌似有点累赘,但好像也是必要的。

(此处原为代码清单的行号,完整代码见下文)
import numpy as np
from collections import Counter

def CreateData():
    """Build the toy loan dataset.

    Returns:
        x: (10, 3) numpy array of string samples (house, marital status, income).
        y: numpy array of labels ('是' / '否').
        signal: per-feature flag array, 0 = categorical, 1 = continuous.
    """
    samples = [
        ['有房', '单身', '125'],
        ['无房', '已婚', '100'],
        ['无房', '单身', '70'],
        ['有房', '已婚', '120'],
        ['无房', '离异', '95'],
        ['无房', '已婚', '60'],
        ['有房', '离异', '220'],
        ['无房', '单身', '85'],
        ['无房', '已婚', '75'],
        ['无房', '单身', '90'],
    ]
    labels = ['否', '否', '否', '否', '是', '否', '否', '是', '否', '是']
    # 0 marks a categorical feature, 1 a continuous one.
    feature_kind = np.array([0, 0, 1])
    return np.array(samples), np.array(labels), feature_kind

def DataProcessing(x, signal):
    """Discretize each continuous column of x in place.

    For every column flagged continuous (signal[i] == 1), the best binary
    split point is found by minimizing the total within-group sum of squared
    deviations; each value is then replaced by '高' (above the split) or
    '低' (at or below it).

    Args:
        x: 2-D numpy array of strings; continuous columns hold numeric text.
        signal: sequence with 0 for categorical, 1 for continuous columns.

    Returns:
        The same array x, with continuous columns binarized in place.
    """
    for i in range(len(signal)):
        if signal[i] != 1:
            continue
        values = [int(s) for s in x[:, i]]
        lo, hi = min(values), max(values)
        # Candidate split points step evenly from lo (hi excluded).
        # Clamp the step to >= 1: the original int((hi-lo)/n) could be 0
        # and make range() raise ValueError for tightly clustered data.
        step = max(1, (hi - lo) // len(values))
        best_cost = float('inf')  # was a magic 20000, which fails for large-variance data
        best_split = 0
        for k in range(lo, hi, step):
            left = [v for v in values if v <= k]
            right = [v for v in values if v > k]
            # lo is always in `left` and hi always in `right` (k < hi),
            # so neither group is empty and the averages are safe.
            avg_left = sum(left) / len(left)
            avg_right = sum(right) / len(right)
            cost = sum((v - avg_left) ** 2 for v in left)
            cost += sum((v - avg_right) ** 2 for v in right)
            if cost < best_cost:  # strict <: keep the first minimal split, as before
                best_cost = cost
                best_split = k
        for j, v in enumerate(values):
            x[j][i] = '高' if v > best_split else '低'

    return x


# Gini impurity
def Gini(y):
    """Return the Gini impurity of label sequence y: 1 - sum(p_k ** 2)."""
    n = len(y)
    # Counter tallies how many samples fall in each class.
    return 1 - sum((count / n) ** 2 for count in Counter(y).values())

# CART algorithm
class DecisionTree:
    """Binary CART decision tree over string-valued features.

    The fitted tree is a nested dict {column_index: {feature_value: subtree_or_label}}.
    For a feature with more than two distinct values, the split is binary:
    one chosen value vs. everything else (keyed as '非' + value), and the
    chosen value for each tree level is recorded in self.lst so predict()
    can route samples that don't match the chosen value.
    """
    def __init__(self):
        self.tree = {}
        # One entry per tree level: the feature value picked as the binary
        # split point when a feature had more than two distinct values
        # (the value whose split gave the smallest Gini index).
        self.lst = []

    # Train the decision tree.
    def fit(self, x, y):
        """Build the tree from samples x (2-D string array) and labels y."""
        cols = list(range(x.shape[1]))  # [0, 1, 2]
        # For each column of x, compute the Gini index and split recursively.
        self.tree = self._genTree(cols, x, y)

    # Recursively generate the decision tree.
    def _genTree(self, cols, x, y):
        """Build a subtree over the remaining column indices `cols`.

        Picks the column with the smallest (conditional) Gini index,
        then recurses into each branch until a branch is pure enough
        (Gini < e) or no columns remain, at which point the branch
        becomes a leaf holding the majority label.
        """
        # Compute Gini indices and find the feature with the smallest one.
        imin = cols[0]  # column index of the best feature so far
        emin = 1  # smallest Gini index seen so far (Gini is always <= 1)
        e = 0.01  # purity threshold: below this a branch becomes a leaf
        st = ''  # for a >2-valued feature, the value giving the smallest Gini index
        for i in cols:
            coli = x[:, i]  # data of the i-th feature
            # Gini index of the sample set conditioned on feature i.
            if 0 < len(set(coli)) <= 2:
                gini = sum([len(y[coli == d]) / len(coli) * Gini(y[coli == d]) for d in set(coli)])
            else:
                # More than two values: try each value d as a binary split
                # (d vs. not-d) and keep the best one.
                emin_tmp = 1
                setlst = list(set(coli))
                for d in range(len(setlst)):
                    gini = len(y[coli == setlst[d]]) / len(coli) * Gini(y[coli == setlst[d]])
                    gini += len(y[coli != setlst[d]]) / len(coli) * Gini(y[coli != setlst[d]])
                    if gini <= emin_tmp:
                        st = setlst[d]
                        emin_tmp = gini
                gini = emin_tmp

            if gini <= emin:
                imin = i
                emin = gini
        # Record the split value once per tree level; predict() indexes
        # self.lst by depth. NOTE(review): st is whatever the *last*
        # >2-valued column produced, which may not be the chosen column —
        # preserved as-is from the original logic.
        self.lst.append(st)

        # Split on the best feature and generate the child nodes.
        newtree = {}
        mincol = x[:, imin]
        cols.remove(imin)
        # For each value of the chosen feature, refine the tree further.
        if 0 < len(set(mincol)) <= 2:
            for d in set(mincol):
                gini = Gini(y[mincol == d])  # purity of this branch
                if gini < e or len(cols) == 0:  # fully separated, or no features left
                    # Leaf: take the majority label of the branch.
                    y_label = Counter(y[mincol == d])
                    y_num = max(y_label.values())
                    for key, values in y_label.items():
                        if values == y_num:
                            newtree[d] = key
                else:  # still mixed: recurse on the remaining columns
                    newtree[d] = self._genTree(cols.copy(), x[mincol == d, :], y[mincol == d])
        else:
            # >2 distinct values: binary split into st vs. '非' + st.
            gini = Gini(y[mincol == st])
            if gini < e or len(cols) == 0:  # fully separated, or no features left
                y_label = Counter(y[mincol == st])
                y_num = max(y_label.values())
                for key, values in y_label.items():
                    if values == y_num:
                        newtree[st] = key
            else:  # still mixed: recurse
                newtree[st] = self._genTree(cols.copy(), x[mincol == st, :], y[mincol == st])

            gini = Gini(y[mincol != st])
            if gini < e or len(cols) == 0:  # fully separated, or no features left
                y_label = Counter(y[mincol != st])
                y_num = max(y_label.values())
                for key, values in y_label.items():
                    if values == y_num:
                        newtree['非' + st] = key
            else:  # still mixed: recurse
                newtree['非' + st] = self._genTree(cols.copy(), x[mincol != st, :], y[mincol != st])

        return {imin: newtree}  # keyed by column index: the new subtree

    # Predict labels for new samples.
    def predict(self, x):
        """Walk each row of x down the tree until a leaf label is reached.

        At each level, if the sample's value is not a key of the node,
        follow the '非' + split-value branch recorded in self.lst for
        that depth. Leaves are the literal labels '是' / '否'.
        """
        x = x.tolist()
        y = [None for i in range(len(x))]
        for i in range(len(x)):
            j = 0  # current tree depth; indexes self.lst
            predictDict = self.tree
            while predictDict != '是' and predictDict != '否':
                col = list(predictDict.keys())[0]
                predictDict = predictDict[col]
                if x[i][col] not in predictDict.keys():
                    # Value not present as a key: take the "everything else" branch.
                    predictDict = predictDict['非' + self.lst[j]]
                else:
                    predictDict = predictDict[x[i][col]]
                j += 1
            else:
                # while/else: runs when the loop exits normally at a leaf.
                y[i] = predictDict
        return y

if __name__ == '__main__':
    # Build the toy dataset, binarize the continuous income column,
    # then fit the tree and echo it along with its training predictions.
    features, labels, kinds = CreateData()
    features = DataProcessing(features, kinds)
    model = DecisionTree()
    model.fit(features, labels)
    print(model.tree)
    print(model.predict(features))