sklearn中K近邻的使用(python)
摘要:本文使用K近邻模型进行回归,分类;
00 构造数据
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
# Build a noisy sine-curve regression dataset: 100 points in [0, 2*pi),
# targets y = sin(x) with every 5th target perturbed by noise in [-1, 1).
x = np.pi * 2 * np.random.rand(100)
y = np.sin(x)
y[::5] += (np.random.rand(20) - 0.5) * 2

# Random 75/25 train/test split by index; setdiff1d replaces the original
# manual complement loop (whose body had lost its indentation) and avoids
# the O(n^2) `i not in dex1` membership test.
dex1 = np.random.choice(100, 75, replace=False)
dex2 = np.setdiff1d(np.arange(100), dex1)

# sklearn estimators expect 2-D feature arrays, hence reshape(-1, 1).
train_x = x[dex1].reshape(-1, 1)
train_y = y[dex1].reshape(-1, 1)
test_x = x[dex2].reshape(-1, 1)
test_y = y[dex2].reshape(-1, 1)
01 KNN回归
# Fit a KNN regressor with default settings (k=5, uniform weights) and
# score it on the held-out points (.score returns R^2).
regre = neighbors.KNeighborsRegressor()
regre.fit(train_x, train_y)
regre.score(test_x, test_y)
# Example REPL output (the data is random, so the exact value varies):
# Out[112]: 0.9719249717643359

# Distances and training-set indices of the 5 nearest neighbours of
# each test point.
regre.kneighbors(test_x, n_neighbors=5, return_distance=True)
研究参数weights,n_neighbors,p对模型预测性能的影响:
# Effect of the `weights` parameter on regression: compare 'uniform'
# (plain mean of neighbours) vs 'distance' (inverse-distance weighted
# mean) across k = 1 .. n_train, plotting test-set R^2 against k.
fig = plt.figure()
weights = ['uniform', 'distance']
ks = np.linspace(1, len(train_y), 50, dtype='int')
for weight in weights:
    scor = []
    for k in ks:
        regre = neighbors.KNeighborsRegressor(weights=weight, n_neighbors=k)
        regre.fit(train_x, train_y)
        scor.append(regre.score(test_x, test_y))
    plt.plot(ks, scor, label=weight)
# loc='best' is the default; spelled out for consistency with the other plots.
plt.legend(loc='best')
# Effect of the Minkowski exponent `p` on regression (p=1 Manhattan,
# p=2 Euclidean, p=10 approaches Chebyshev), again sweeping k and
# plotting test-set R^2.
fig = plt.figure()
ps = [1, 2, 10]
ks = np.linspace(1, len(train_y), 50, dtype='int')
for p in ps:
    scor = []
    for k in ks:
        regre = neighbors.KNeighborsRegressor(p=p, n_neighbors=k)
        regre.fit(train_x, train_y)
        scor.append(regre.score(test_x, test_y))
    plt.plot(ks, scor, label='p=' + str(p))
plt.legend(loc='best')
02 获取sklearn中数据
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets,neighbors
# Load the sklearn handwritten-digits dataset (1797 samples of 64
# features; data is already 2-D, so no reshape is needed).
digits = datasets.load_digits()

# Random 1500/297 train/test split by index; setdiff1d replaces the
# original complement loop (whose body had lost its indentation) and
# avoids the O(n^2) `i not in dex1` membership test over 1797 items.
dex1 = np.random.choice(1797, 1500, replace=False)
dex2 = np.setdiff1d(np.arange(1797), dex1)

train_x = digits.data[dex1]
train_y = digits.target[dex1]
test_x = digits.data[dex2]
test_y = digits.target[dex2]
03 KNN分类
# Fit a KNN classifier with default settings (k=5, uniform weights) and
# report mean accuracy on the held-out digits (.score returns accuracy).
classi = neighbors.KNeighborsClassifier()
classi.fit(train_x, train_y)
classi.score(test_x, test_y)
# Example REPL output (random split, so the exact value varies):
# Out[120]: 0.9966329966329966
研究参数weights,n_neighbors,p对模型预测性能的影响:
# Effect of the `weights` parameter on classification: 'uniform'
# (majority vote) vs 'distance' (inverse-distance weighted vote),
# sweeping k = 1 .. n_train and plotting test-set accuracy.
fig = plt.figure()
weights = ['uniform', 'distance']
ks = np.linspace(1, len(train_y), 100, dtype='int')
for weight in weights:
    scor = []
    for k in ks:
        classi = neighbors.KNeighborsClassifier(weights=weight, n_neighbors=k)
        classi.fit(train_x, train_y)
        scor.append(classi.score(test_x, test_y))
    plt.plot(ks, scor, label=weight)
plt.legend(loc='best')
# Effect of the Minkowski exponent `p` on classification (p=1 Manhattan,
# p=2 Euclidean, p=10 approaches Chebyshev), sweeping k and plotting
# test-set accuracy.
fig = plt.figure()
ps = [1, 2, 10]
ks = np.linspace(1, len(train_y), 100, dtype='int')
for p in ps:
    scor = []
    for k in ks:
        classi = neighbors.KNeighborsClassifier(p=p, n_neighbors=k)
        classi.fit(train_x, train_y)
        scor.append(classi.score(test_x, test_y))
    plt.plot(ks, scor, label='p=' + str(p))
plt.legend(loc='best')
04 总结
01 K近邻模型的重要参数:K值(n_neighbors)、距离度量(p)、加权方式(weights),即上文涉及的;
02 K近邻模型既可以用于分类(多数表决),也可以用于回归(均值);
03 K近邻模型是典型的 lazy-learning;