1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
| import random import csv
def CreateData():
    """Build and return the watermelon sample set.

    Every sample is a list of seven strings: six categorical feature values
    followed by the class label ('是' or '否') in the last position.  A new
    list of new row lists is created on every call, so callers may mutate
    the result freely.
    """
    # One record per line, fields separated by a single space.
    records = (
        "青绿 蜷缩 浊响 清晰 凹陷 硬滑 是",
        "乌黑 蜷缩 沉闷 清晰 凹陷 硬滑 是",
        "乌黑 蜷缩 浊响 清晰 凹陷 硬滑 是",
        "青绿 蜷缩 沉闷 清晰 凹陷 硬滑 是",
        "浅白 蜷缩 浊响 清晰 凹陷 硬滑 是",
        "青绿 稍蜷 浊响 清晰 稍凹 软粘 是",
        "乌黑 稍蜷 浊响 稍糊 稍凹 软粘 是",
        "乌黑 稍蜷 浊响 清晰 稍凹 硬滑 是",
        "乌黑 稍蜷 沉闷 稍糊 稍凹 硬滑 否",
        "青绿 硬挺 清脆 清晰 平坦 软粘 否",
        "浅白 硬挺 清脆 模糊 平坦 硬滑 否",
        "浅白 蜷缩 浊响 模糊 平坦 软粘 否",
        "青绿 稍蜷 浊响 稍糊 凹陷 硬滑 否",
        "浅白 稍蜷 沉闷 稍糊 凹陷 硬滑 否",
        "乌黑 稍蜷 浊响 清晰 稍凹 软粘 否",
        "浅白 蜷缩 浊响 模糊 平坦 硬滑 否",
        "青绿 蜷缩 沉闷 稍糊 稍凹 硬滑 否",
    )
    return [record.split(" ") for record in records]
def DataPreprocessing(sampleset):
    """Randomly split the samples into a training set and a test set.

    The samples are shuffled with the module-level ``random`` generator and
    the first 90% (rounded down) become the training set; the remainder is
    the test set.

    Args:
        sampleset: Sequence of samples (each a list of feature values
            followed by the label).  It is NOT modified.

    Returns:
        A ``(trainset, testset)`` tuple of lists.  The row objects are shared
        with ``sampleset`` (shallow copy).
    """
    # Shuffle a copy so the caller's list keeps its original order; the
    # resulting permutation is the same one an in-place shuffle would give
    # for the same RNG state.
    shuffled = list(sampleset)
    random.shuffle(shuffled)
    split_index = int(len(shuffled) * 0.9)
    return shuffled[:split_index], shuffled[split_index:]
class Bayes:
    """Naive Bayes classifier for categorical features with additive smoothing.

    Every sample is a list whose last element is the class label and whose
    preceding elements are categorical (hashable) feature values.

    Args:
        trainset: Training samples (stored on the instance).
        testset: Test samples (stored on the instance).
        n: Additive smoothing constant.  ``n=0`` yields maximum-likelihood
            estimates, ``n=1`` yields Laplace smoothing.
    """

    def __init__(self, trainset, testset, n=1):
        self.trainset = trainset
        self.testset = testset
        self.n = n

    @staticmethod
    def _count_labels(trainset):
        """Return ``{label: count}`` keyed in order of first appearance."""
        counts = {}
        for row in trainset:
            counts[row[-1]] = counts.get(row[-1], 0) + 1
        return counts

    def cal_prior_prob(self, trainset):
        """Estimate and print the smoothed prior probability of every class.

        Uses ``(count(c) + n) / (N + K * n)`` where ``N`` is the number of
        training samples and ``K`` the number of distinct classes.

        Args:
            trainset: Training samples; the label is the last element.

        Returns:
            Dict mapping each label to its prior probability.
        """
        label_counts = self._count_labels(trainset)
        # The smoothing term must scale with the number of classes so the
        # priors sum to 1; the size of the test set is irrelevant here.
        denominator = len(trainset) + len(label_counts) * self.n
        labelpriorprob = {
            label: (count + self.n) / denominator
            for label, count in label_counts.items()
        }
        print("类别的先验概率为:", end="")
        print(labelpriorprob)
        return labelpriorprob

    def cal_cond_prob(self, trainset):
        """Estimate and print P(feature value | class) for every feature.

        Uses ``(count(value, c) + n) / (count(c) + S * n)`` where ``S`` is
        the number of distinct values the feature takes in ``trainset``.

        Args:
            trainset: Non-empty list of training samples.

        Returns:
            Dict mapping each label to a list with one dict per feature;
            each dict maps a feature value to its conditional probability.
        """
        fea_num = len(trainset[0]) - 1

        # Distinct values of each feature, in order of first appearance.
        fealist = [
            list(dict.fromkeys(row[k] for row in trainset))
            for k in range(fea_num)
        ]

        # Group the feature vectors (label stripped) by class label.
        separated = {}
        for row in trainset:
            separated.setdefault(row[-1], []).append(row[:-1])

        feacondprob = {}
        for label, rows in separated.items():
            print()
            print("'{0}'类别的各特征的条件概率为:".format(label))
            per_feature = []
            for k in range(fea_num):
                # Start every known value at zero so values that never
                # occur in this class still receive a (smoothed) estimate.
                counts = dict.fromkeys(fealist[k], 0)
                for features in rows:
                    counts[features[k]] += 1
                denominator = len(rows) + len(counts) * self.n
                feadict = {
                    value: (count + self.n) / denominator
                    for value, count in counts.items()
                }
                print(feadict)
                per_feature.append(feadict)
            feacondprob[label] = per_feature
        return feacondprob

    def predict(self, trainset, testset):
        """Train on ``trainset`` and classify every sample in ``testset``.

        For each test sample the prior of every class is multiplied by the
        conditional probabilities of the sample's feature values; the class
        with the largest product wins (ties keep training-label order).
        Intermediate probabilities are printed.

        Args:
            trainset: Non-empty list of training samples.
            testset: Samples to classify; the last element of each row is
                treated as the label slot and is not used as a feature.

        Returns:
            List with the predicted label of every test sample.
        """
        labelpriorprob = self.cal_prior_prob(trainset)
        feacondprob = self.cal_cond_prob(trainset)
        class_counts = self._count_labels(trainset)
        fea_num = len(testset[0]) - 1 if testset else 0
        testlabels = []

        for index, row in enumerate(testset):
            bayesProbability = {}
            for label, prior in labelpriorprob.items():
                score = prior
                featureList = feacondprob[label]
                for k in range(fea_num):
                    value_probs = featureList[k]
                    value = row[k]
                    if value in value_probs:
                        score *= value_probs[value]
                    else:
                        # Value never seen in training: treat it as a
                        # zero-count value under the same smoothing formula
                        # instead of raising KeyError (gives 0 when n == 0).
                        score *= self.n / (
                            class_counts[label] + len(value_probs) * self.n
                        )
                bayesProbability[label] = score
            result = sorted(
                bayesProbability.items(), key=lambda x: x[1], reverse=True
            )
            print("测试样本{0}的各类别概率:".format(index), end="")
            print(result)
            print()
            testlabels.append(result[0][0])

        return testlabels


if __name__ == '__main__':
    sampleset = CreateData()
    print("样本集为:")
    print(sampleset)
    trainset, testset = DataPreprocessing(sampleset)
    print("其中训练集为:")
    print(trainset)
    print("其中测试集为:")
    print(testset)
    print()

    # Maximum-likelihood estimation (no smoothing).
    bayes = Bayes(trainset, testset, n=0)
    print("使用最大似然法判断测试集类别:")
    testlabels = bayes.predict(trainset, testset)
    print("测试集各实例点的最终结果:{0}".format(testlabels))
    print()
    print()

    # Laplace (add-one) smoothing.
    bayes = Bayes(trainset, testset, n=1)
    print("使用拉普拉斯平滑判断测试集类别:")
    testlabels = bayes.predict(trainset, testset)
    print("测试集各实例点的最终结果:{0}".format(testlabels))
|