k-近邻算法

2022年4月5日：这周又来个新作业，k-近邻算法手算很容易，但用代码实现确实有点困难了。只能在网上找找参考：https://github.com/zlxy9892/ml_code/blob/master/basic_algorithm/knn 看能不能看懂牛牛们写的算法吧。

2022年5月4日：我们的第一次实验报告竟然是k近邻。想偷懒都不行了，这几天基本上看懂了上面链接的knn算法，先把上面连接的knn算法做个解析。

# -*- coding: utf-8 -*-
# knn.py
import numpy as np
import operator

class KNN(object):
	#取3个近邻点
    def __init__(self, k=3):
        self.k = k

    def fit(self, x, y):
        self.x = x
        self.y = y

    def _square_distance(self, v1, v2):
        #np.square()计算数组中每个元素的平方值，np.sum()求和
        #欧氏距离
        return np.sum(np.square(v1-v2))

    def _vote(self, ys):
        #np.unique()去除数组中的重复元素
        ys_unique = np.unique(ys)
        vote_dict = {}
        for y in ys:
            if y not in vote_dict.keys():
                vote_dict[y] = 1
            else:
                vote_dict[y] += 1
        #items()返回可遍历的(键, 值)元组数组
        #key=operator.itemgetter(1)与sorted结合，表明以元组的第1列进行排序
        sorted_vote_dict = sorted(vote_dict.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_vote_dict[0][0]

    def predict(self, x):
        y_pred = []
        for i in range(len(x)):
            dist_arr = [self._square_distance(x[i], self.x[j]) for j in range(len(self.x))]
            #np.argsort()将dist_arr中的元素从小到大排列，提取其对应的index(索引)
            sorted_index = np.argsort(dist_arr)
            top_k_index = sorted_index[:self.k]
            y_pred.append(self._vote(ys=self.y[top_k_index]))
        return np.array(y_pred)

    def score(self, y_true=None, y_pred=None):
        if y_true is None and y_pred is None:
            y_pred = self.predict(self.x)
            y_true = self.y
        score = 0.0
        for i in range(len(y_true)):
            if y_true[i] == y_pred[i]:
                score += 1
        score /= len(y_true)
        return score

# -*- coding: utf-8 -*-
# train.py
import numpy as np
import matplotlib.pyplot as plt
from knn import *

#np.random.seed()用来设置随机数的种子
# data generation
np.random.seed(314)

# 创建第一个数据集
data_size_1 = 300
#np.random.normal()的意思是一个正态分布,loc=正态分布的均值，scale=正态分布的标准差,size=输入数据的shape
#第一个数据集以(5,4)为圆心的正态分布
x1_1 = np.random.normal(loc=5.0, scale=1.0, size=data_size_1)#这里的size表示(300,)表示300行
x2_1 = np.random.normal(loc=4.0, scale=1.0, size=data_size_1)
y_1 = [0 for _ in range(data_size_1)]#y1一个类别，y2一个类别

# 创建第二个数据集
data_size_2 = 400
#第二个数据集以(10,8)为圆心的正态分布
x1_2 = np.random.normal(loc=10.0, scale=2.0, size=data_size_2)#这里的size表示(400,)表示400行
x2_2 = np.random.normal(loc=8.0, scale=2.0, size=data_size_2)
y_2 = [1 for _ in range(data_size_2)]

#np.concatenate()对array进行拼接
x1 = np.concatenate((x1_1, x1_2), axis=0)#x1拼接后有700行，即x的坐标有700个
x2 = np.concatenate((x2_1, x2_2), axis=0)#坐标轴的y的坐标有700个
#x1.reshape(-1,1)表示不知道几行，改成1列，这里意思是最后一维度是1
#np.hstack()将参数元组的元素数组按水平方向进行叠加，叠加后相当于一个列表作为一个坐标点
x = np.hstack((x1.reshape(-1,1), x2.reshape(-1,1)))
y = np.concatenate((y_1, y_2), axis=0)#y1y2进行拼接后有700个数据

data_size_all = data_size_1+data_size_2
shuffled_index = np.random.permutation(data_size_all)#对0到700的数随机排列序列，返回一个列表
x = x[shuffled_index]
y = y[shuffled_index]

#70%作为训练集，30%作为测试集
split_index = int(data_size_all*0.7)
x_train = x[:split_index]
y_train = y[:split_index]
x_test = x[split_index:]
y_test = y[split_index:]

# visualize data
#x_train[:,0]取训练集中所有集合的第0个数据，c表示颜色，也就是y集合中不同的数值表示不同类别
plt.scatter(x_train[:,0], x_train[:,1], c=y_train, marker='.')
plt.show()
plt.scatter(x_test[:,0], x_test[:,1], c=y_test, marker='.')
plt.show()

# data preprocessing
#数据预处理，这里不懂
x_train = (x_train - np.min(x_train, axis=0)) / (np.max(x_train, axis=0) - np.min(x_train, axis=0))
x_test = (x_test - np.min(x_test, axis=0)) / (np.max(x_test, axis=0) - np.min(x_test, axis=0))

# knn classifier
clf = KNN(k=3)
clf.fit(x_train, y_train)

print('train accuracy: {:.3}'.format(clf.score()))

y_test_pred = clf.predict(x_test)
print('test accuracy: {:.3}'.format(clf.score(y_test, y_test_pred)))

其实看懂了觉得很容易是不是，作者在knn.py中用了欧氏距离，k值为3，算测试点到全部训练点的距离找前k个最近的训练点进行投票，还加上了一个判断准确率的函数。在train.py中创建随机样本集，将样本集分为训练集和测试集，并且可视化样本集，利用knn.py判断测试集被正确分类的准确率。

为什么说它容易呢？因为作者没有利用到一种更高效率的搜索最近邻点的方法——KD树。在面对海量数据时，计算测试点到全部训练点的距离会花费很长时间，而且也根本没有必要计算测试点到全部训练点的距离，这一点都不“机器学习”。而且关于K值的选取，是很有讲究的。具体看机器学习 3.2.3 K值的选择

当然啦以我目前的水平还不足以解决这些问题，我还是借鉴了作者的代码才能成功写好我的实验报告，所以以上这些问题也是将来我的代码需要完善的地方。

import numpy as np
import operator
import matplotlib.pyplot as plt

class KNN:
    def __init__(self,samples,s_labels,k=3):
        self.samples = samples
        self.s_labels = s_labels
        self.k = k

    #曼哈顿距离只计算水平或垂直距离，有维度的限制。而欧氏距离可用于任何空间的距离计算问题，所以数据点可以存在于任何空间，欧氏距离是更可行的选择
    def distance(self,trains,samples,flag):
        if flag != 2:
            return np.sqrt(np.sum(np.square(trains-samples)))#欧氏距离
        else:
            return np.sum(np.fabs(trains-samples))#曼哈顿距离

    def vote(self,s_labels):
        vote_dict = {}
        for i in s_labels:
            if i not in vote_dict.keys():
                vote_dict[i] = 1
            else:
                vote_dict[i] += 1
        #items()返回可遍历的(键, 值)元组数组
        #key=operator.itemgetter(1)与sorted结合，表明以元组的第1列进行排序
        sorted_vote_dict = sorted(vote_dict.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_vote_dict[0][0]

    #直接计算测试点与样本集的距离，选出k个最近
    def TopKPredict(self,trains,flag=1):
        t_labels = []
        for i in range(len(trains)):
            #每个测试点生成一个距离数组
            dist_arr = [self.distance(trains[i],self.samples[j],flag) for j in range(len(self.samples))]
            #np.argsort()将dist_arr中的元素从小到大排列，提取其对应的index(索引)
            sorted_index = np.argsort(dist_arr)
            #找距离最近的前k个索引
            top_k_index = sorted_index[:self.k]
            #对应测试点索引的类别，将k个类别进行投票
            t_labels.append(self.vote(self.s_labels[top_k_index]))

        return np.array(t_labels)

class Picture:
    def __init__(self,samples,s_labels,trains,t_labels):
        #.表示训练点，x表示测试点，测试点与哪种颜色相同即为哪种类别
        s_colors = []
        t_colors = []
        for i in s_labels:
            if i == 1:
                s_colors.append('b')
            else:
                s_colors.append('y')
        for j in t_labels:
            if j == 1:
                t_colors.append('b')
            else:
                t_colors.append('y')
        plt.scatter(samples[:,0], samples[:,1], c=s_colors, marker='.')
        plt.scatter(trains[:,0], trains[:,1], c=t_colors, marker='x')

    def Show(self):
        plt.show()

#生成已有标注的样本集与未标注的训练集
def createdata():
    #np.array()用来产生数组
    
    #手动赋值
    #samples = np.array([[2,3],[5,4],[9,6],[4,7],[8,1],[7,2]])  
    #s_labels = np.array([-1,-1,1,1,-1,-1])
    #trains = np.array([[3,4.5],[2,4.5],[6,6]])
    
    #随机赋值
    datasize_1 = 50
    datasize_2 = 60
    trainsize = 5
    #创建正例和负例的训练集
    samples_1 = np.random.normal(loc=10,scale=2,size=(datasize_1,2))
    s_labels_1 = np.array([1 for i in range(datasize_1)])
    samples_2 = np.random.normal(loc=5,scale=2,size=(datasize_2,2))
    s_labels_2 = np.array([-1 for i in range(datasize_2)])    
    #拼接训练集
    samples = np.concatenate((samples_1,samples_2),axis=0)
    s_labels = np.concatenate((s_labels_1,s_labels_2),axis=0)   
    #创建测试集
    trains = np.random.randint(2,12,size=[trainsize,2])
    
    return samples,s_labels,trains  #返回值如果是两个及以上，是以元组的形式返回

if __name__ == '__main__':
#输入：训练集
#输出：k个最近邻点，根据k个最近邻点的类别判断测试点的类别，投票原则
    samples,s_labels,trains=createdata() #将createdata的三个返回值分别赋给samples、labels、trains
    knn = KNN(samples,s_labels,k=1)
    
    #默认使用欧氏距离，flag=2表示使用曼哈顿距离
    t_labels = knn.TopKPredict(trains,flag=1)
    pic = Picture(samples,s_labels,trains,t_labels)
    pic.Show()