KNN演算法是機器學習最為簡單的演算法之一,具體的思想這裡不做講解了,可以自行上網查閱。本文主要是用python來模仿sklearn實現knn演算法。
匯入所需的庫
import numpy as np
from math import sqrt
from collections import Counter


# Core kNN code
def KNN_classify(k, X_train, y_train, x):
    """Classify a single sample with a k-nearest-neighbours majority vote.

    k: the "k" in kNN — number of neighbours that take part in the vote
    X_train: training features, shape (n_samples, n_features)
    y_train: training labels, shape (n_samples,)
    x: the new sample to classify, shape (n_features,)

    Returns the label most common among the k nearest training samples.
    """
    assert 1 <= k <= X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must equal to the size of y_train"
    assert X_train.shape[1] == x.shape[0], \
        "the feature number of x must to be equal to X_train"

    # Euclidean distance from x to every sample in the training set.
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    # Indices of the training samples, sorted by increasing distance.
    nearest = np.argsort(distances)
    # BUG FIX: the original iterated over *all* of `nearest`, so every
    # training sample voted; only the k closest ones may vote.
    topK_y = [y_train[i] for i in nearest[:k]]
    # Tally how many of the k neighbours belong to each class.
    votes = Counter(topK_y)
    # Return the class with the most votes.
    return votes.most_common(1)[0][0]
虛擬資料進行驗證
# Validate KNN_classify with synthetic data: two clearly separated
# clusters, three points each.
x = [[0, 0], [1, 1], [2, 2], [10, 10], [11, 11], [12, 12]]
y = [0, 0, 0, 1, 1, 1]
X_train = np.array(x)
y_train = np.array(y)

import matplotlib.pyplot as plt

# Plot the two clusters: class 0 in red, class 1 in blue.
plt.scatter(X_train[:3, 0], X_train[:3, 1], color='red')
plt.scatter(X_train[3:, 0], X_train[3:, 1], color='blue')
plt.show()

# A new point near the blue cluster should be classified as 1.
x = np.array([13, 13])
KNN_classify(2, X_train, y_train, x)
# => 1
sklearn實現knn分類
# Same classification done with sklearn's built-in implementation.
from sklearn.neighbors import KNeighborsClassifier

kNN_classifier = KNeighborsClassifier()
kNN_classifier.fit(X_train, y_train)
# fit() echoes the estimator and its (default) hyper-parameters:
# => KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#                         metric_params=None, n_jobs=None, n_neighbors=5,
#                         p=2, weights='uniform')

# sklearn expects a 2-D array of samples, so reshape the single point
# into a (1, n_features) matrix before predicting.
X_predict = x.reshape(1, -1)
kNN_classifier.predict(X_predict)
# => array([1])
模仿sklearn重新封裝knn
import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """A k-nearest-neighbours classifier mimicking the sklearn interface."""

    def __init__(self, k):
        """Initialise the kNN classifier with the number of neighbours k."""
        assert k >= 1, "k must be valid"
        self.k = k  # the "k" in kNN
        # Training data lives inside the class; users must not touch it
        # directly, hence the private (underscore) attributes.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Train the kNN classifier on features X_train and labels y_train."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        # Mimic sklearn: fit() returns the estimator itself.
        return self

    def predict(self, X_predict):
        """Predict a label for every row of the 2-D sample array X_predict.

        As in sklearn, the caller must pass an array of samples; the
        result vector for X_predict is returned as a numpy array.
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"
        # Classify each row of X_predict independently.
        y_predict = [self._predict(x) for x in X_predict]
        # The returned format also follows sklearn.
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for a single sample x."""
        # First check that x is well-formed.
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"
        # Euclidean distance from x to every training sample.
        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        # Sort the distances and keep the corresponding indices.
        nearest = np.argsort(distances)
        # Labels of the k nearest training samples.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        # Count how many neighbours fall in each class and return the
        # class with the largest count.
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k=%d)" % self.k
# Validate the hand-rolled KNNClassifier on the same synthetic data.
X_train = np.array([[0, 0], [1, 1], [2, 2], [10, 10], [11, 11], [12, 12]])
y_train = np.array([0, 0, 0, 1, 1, 1])

import matplotlib.pyplot as plt

# Plot the two clusters: class 0 in red, class 1 in blue.
plt.scatter(X_train[:3, 0], X_train[:3, 1], color='red')
plt.scatter(X_train[3:, 0], X_train[3:, 1], color='blue')
plt.show()

# Two new samples: one near the blue cluster, one near the red cluster.
x = np.array([[13, 13], [-1, -1]])
knn_clf = KNNClassifier(k=2)
knn_clf.fit(X_train, y_train)
# fit() returns self, which the REPL echoes via __repr__:
# => KNN(k=2)
knn_clf.predict(x)
# => array([1, 0])