Sklearn进阶

正规化 Normalization

数据标准化

`preprocessing.scale(X)`: 将每个特征(列)标准化为均值为 0、方差为 1。

`preprocessing.minmax_scale(X, feature_range=(0, 1))`: 将每个特征缩放到指定区间;`feature_range` 用于设置标准化后数据的范围,默认为 0~1。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import numpy as np
from sklearn import preprocessing  # data standardization utilities

# Build a 3x3 float64 sample matrix whose columns span very different ranges.
a = np.array(
    [
        [10, 2.7, 3.6],
        [-100, 5, -2],
        [120, 20, 40],
    ],
    dtype=np.float64,
)

# Show the raw matrix first ...
print(a)

# ... then the same matrix after column-wise standardization.
print(preprocessing.scale(a))
# Expected output of the second print:
# [[ 0.         -0.85170713 -0.55138018]
#  [-1.22474487 -0.55187146 -0.852133  ]
#  [ 1.22474487  1.40357859  1.40351318]]

加载模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Data standardization utilities
from sklearn import preprocessing
import numpy as np

# Helper that splits data into training and test sets
from sklearn.model_selection import train_test_split

# Generator for synthetic classification datasets.
# NOTE: the old `sklearn.datasets.samples_generator` module path was
# deprecated in scikit-learn 0.22 and removed in 0.24; importing from
# `sklearn.datasets` works on both old and current releases.
from sklearn.datasets import make_classification

# Support Vector Classifier from the Support Vector Machine module
from sklearn.svm import SVC

# Plotting module used to visualize the data
import matplotlib.pyplot as plt

生成数据-生成适合做Classification数据

1
2
3
4
5
6
7
8
9
10
# Generate 300 samples with 2 features each (both informative, none
# redundant, one cluster per class). scale=100 is passed so the feature
# values are multiplied by 100; random_state=22 fixes the generated data.
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    scale=100,
    random_state=22,
)

# Visualize the data: feature 0 against feature 1, colored by class label.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

原始数据可视化

### 训练数据

1
2
3
4
5
# Hold out 30% of the (unscaled) data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# gamma="auto" (1 / n_features) is pinned explicitly: it was the SVC default
# when this example was written. Since scikit-learn 0.22 the default is
# gamma="scale", which adapts to the feature variance and would hide the
# effect of the unscaled features that this step is meant to demonstrate.
clf = SVC(gamma="auto")
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test set.
print(clf.score(X_test, y_test))
# e.g. 0.477777777778 (the split is not seeded, so the exact value varies per run)

标准化数据及可视化

1
2
3
4
5
6
7
8
9
# Standardize every feature column of the whole dataset.
# NOTE(review): scaling is applied before the train/test split below, so the
# test rows also contribute to the scaling statistics.
X = preprocessing.scale(X)

# Plot the data again to show the value range after standardization.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

# Re-split the standardized data (30% held out), then train a fresh SVC
# and print its mean accuracy on the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = SVC().fit(X_train, y_train)
print(clf.score(X_test, y_test))
# 0.9
标准化后数据

标准化后数据