简单线性回归(最小二乘法)
0.引入依赖
import numpy as np
import matplotlib.pyplot as plt
1.导入数据
# Read the comma-separated file 'data.csv' into a NumPy array.
points = np.genfromtxt('data.csv', delimiter=',')
# Bare expression selecting row 0, column 0; its recorded value is shown
# on the following line of this document.
points[0, 0]
32.50234526945303
# Extract the two columns of points and use them as x and y respectively.
x = points[:, 0]
y = points[:, 1]
# Draw a scatter plot of the data with plt.
plt.scatter(x, y)
plt.show()
2.定义损失函数
求解 $w$ 和 $b$,使得 $E(w, b) = \sum_{i=1}^{m}(y_{i} - wx_{i} - b)^{2}$ 最小化的过程,称为线性回归模型的“最小二乘参数估计”。下面的 computerCost 返回的是该平方误差和除以样本数 $m$ 得到的均值;除以常数 $m$ 不会改变使其取最小值的 $w$ 和 $b$。
# 损失函数是系数的函数,另外还需要传入数据 points(x,y)
def computerCost(w, b, points):
    """Return the mean squared error of the line ``y = w * x + b`` on ``points``.

    Args:
        w: Slope of the line.
        b: Intercept of the line.
        points: Array-like with one row per sample; column 0 holds x and
            column 1 holds y.

    Returns:
        The average of ``(y - w * x - b) ** 2`` over all rows.

    Raises:
        ValueError: If ``points`` contains no rows.
    """
    data = np.asarray(points, dtype=float)
    if data.size == 0:
        # The mean over zero samples is undefined; fail with a clear message
        # instead of a bare division by zero.
        raise ValueError("points must contain at least one (x, y) row")
    # Squared error of every sample at once, then averaged.
    residuals = data[:, 1] - w * data[:, 0] - b
    return np.mean(residuals ** 2)
3.算法拟合函数
$w = \frac{\sum_{i=1}^{m}y_{i}(x_{i} - \bar{x})}{\sum_{i=1}^{m}x_{i}^{2} - \frac{1}{m}(\sum_{i=1}^{m}x_{i})^{2}}$
$b = \frac{1}{m}\sum_{i=1}^{m}(y_{i} - wx_{i})$
$\bar{x} = \frac{1}{m}\sum_{i=1}^{m}x_{i}$
# 先定义求均值的函数
def average(data):
    """Return the arithmetic mean of the values in ``data``.

    ``data`` must support both iteration (for ``sum``) and ``len``.
    """
    total = sum(data)
    count = len(data)
    return total / count
# 定义核心拟合函数
def fit(points):
    """Fit ``y = w * x + b`` to ``points`` with closed-form least squares.

    Args:
        points: Array-like with one row per sample; column 0 holds x and
            column 1 holds y.

    Returns:
        A tuple ``(w, b)``: the slope and the intercept of the fitted line.

    Raises:
        ValueError: If ``points`` is empty, or if every x value is the same
            (the slope is then undefined).
    """
    data = np.asarray(points, dtype=float)
    if data.size == 0:
        raise ValueError("points must contain at least one (x, y) row")
    x = data[:, 0]
    y = data[:, 1]

    x_centered = x - x.mean()
    # sum((x - x_bar)^2) is algebraically equal to sum(x^2) - M * x_bar^2,
    # but it avoids subtracting two huge, nearly equal numbers, which loses
    # most significant digits when x is large relative to its spread.
    denominator = np.sum(x_centered ** 2)
    if denominator == 0:
        raise ValueError("all x values are identical; the slope is undefined")

    # Slope from the formula w = sum(y_i * (x_i - x_bar)) / sum((x_i - x_bar)^2).
    w = np.sum(y * x_centered) / denominator
    # Intercept: the mean of what is left after removing the slope term.
    b = np.mean(y - w * x)
    return w, b
4.测试
# Fit the line to the loaded points, then evaluate the cost of that fit.
w, b = fit(points)
cost = computerCost(w, b, points)
# Report the fitted slope, intercept and cost.
print('W is : ', w)
print('B is : ', b)
print('Cost : ', cost)
W is : 1.3224310227553846
B is : 7.991020982269173
Cost : 110.25738346621313
5.画出拟合曲线
plt.scatter(x, y)
# For every x, compute the predicted y value.
pred_y = w * x + b
# Draw the predictions as a red line on top of the scatter plot.
plt.plot(x, pred_y, c='r')
plt.show()
使用 scikit-learn(sklearn)库来实现线性回归模型
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection
def load_data(path='data.csv', test_size=0.25, random_state=0):
    """Load a two-column CSV file and split it into training and test sets.

    Args:
        path: CSV file to read. Defaults to 'data.csv'.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.25.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 0.

    Returns:
        The result of ``model_selection.train_test_split(x, y, ...)``, which
        callers unpack as ``x_train, x_test, y_train, y_test``.
    """
    points = np.genfromtxt(path, delimiter=',')
    # Column 0 is reshaped into a single-column 2-D array because
    # scikit-learn estimators take the features as a 2-D matrix.
    x = points[:, 0].reshape(-1, 1)
    y = points[:, 1]
    return model_selection.train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
# Unpack the four arrays returned by load_data() into module-level names.
x_train, x_test, y_train, y_test = load_data()
def LinearRegression(*data):
    """Fit scikit-learn's linear regression, then print and plot the result.

    Args:
        *data: Four positional values, in the order
            ``x_train, x_test, y_train, y_test``.

    Prints the fitted coefficient and intercept, the mean squared prediction
    error on the test split (labelled 'Residual'), and the model's score on
    the test split. Then shows a scatter plot of all samples with the fitted
    line drawn in red.
    """
    x_train, x_test, y_train, y_test = data

    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)
    print('Coefficient: %s, Intercepts: %.2f' % (model.coef_, model.intercept_))

    test_errors = model.predict(x_test) - y_test
    print('Residual : %.2f' % np.mean(test_errors ** 2))
    print('Scores : %.2f' % model.score(x_test, y_test))

    # Plot: stack the training and test samples back together.
    all_x = np.concatenate((x_train, x_test))
    all_y = np.concatenate((y_train, y_test))
    plt.scatter(all_x, all_y)
    plt.plot(all_x, model.predict(all_x), 'r')
    plt.show()
# Load a split and pass its four arrays positionally to LinearRegression.
data = load_data()
LinearRegression(*data)
Coefficient: [1.33810372], Intercepts: 7.22
Residual : 137.15
Scores : 0.38