简单线性回归(最小二乘法)

0.引入依赖

import numpy as np
import matplotlib.pyplot as plt

1.导入数据

# Load the comma-separated file 'data.csv' with NumPy.
points = np.genfromtxt('data.csv', delimiter=',')
# Inspect the element in the first row, first column.
points[0, 0]
# NOTE(review): the bare number below looks like pasted notebook output for
# the expression above rather than intended code — confirm.
32.50234526945303
# Take the first two columns of points as x and y respectively.
x = points[:, 0]
y = points[:, 1]

# Draw a scatter plot of the data with plt.
plt.scatter(x, y)
plt.show()

output_6_0.png

2.定义损失函数

求解 $w$ 和 $b$,使得 $E(w, b) = \sum_{i=1}^{m}(y_{i} - wx_{i} - b)^{2}$ 最小化的过程,称为线性回归模型的“最小二乘参数估计”。下面的代码返回的是平均平方误差 $\frac{1}{m}E(w, b)$;它与 $E(w, b)$ 只相差常数因子 $\frac{1}{m}$,因此二者在相同的 $w$ 和 $b$ 处取得最小值。

# The loss is a function of the coefficients w and b; the data points (x, y) must also be passed in.

def computerCost(w, b, points):
    """Return the mean squared error of the line ``y = w * x + b``.

    Args:
        w: Slope of the candidate line.
        b: Intercept of the candidate line.
        points: Two-dimensional array read as ``points[i, 0]`` (x) and
            ``points[i, 1]`` (y), one row per sample.

    Returns:
        The sum of the squared residuals divided by the number of rows.
    """
    n_rows = len(points)
    squared_error_sum = 0.0

    # Accumulate the squared residual of every sample, row by row.
    for row in range(n_rows):
        x_i, y_i = points[row, 0], points[row, 1]
        residual = y_i - w * x_i - b
        squared_error_sum += residual ** 2

    # Averaging over the rows turns the summed error into a mean.
    return squared_error_sum / n_rows
     

3.算法拟合函数

$w = \frac{\sum_{i=1}^{m}y_{i}(x_{i} - \bar{x})}{\sum_{i=1}^{m}x_{i}^{2} - \frac{1}{m}(\sum_{i=1}^{m}x_{i})^{2}}$

$b = \frac{1}{m}\sum_{i=1}^{m}(y_{i} - wx_{i})$

$\bar{x} = \frac{1}{m}\sum_{i=1}^{m}x_{i}$

# 先定义求均值的函数

def average(data):
    """Return the arithmetic mean of ``data``.

    Args:
        data: A sized iterable of numbers (it is passed to both ``sum``
            and ``len``).

    Returns:
        The sum of the items divided by their count.
    """
    total = sum(data)
    count = len(data)
    return total / count

# 定义核心拟合函数

def fit(points):
    """Fit the line ``y = w * x + b`` to ``points`` by least squares.

    The parameters come from the closed-form solution::

        w = sum(y_i * (x_i - x_bar)) / (sum(x_i ** 2) - M * x_bar ** 2)
        b = sum(y_i - w * x_i) / M

    where ``M`` is the number of rows and ``x_bar`` is the mean of x.

    Args:
        points: Two-dimensional NumPy array with x in column 0 and y in
            column 1, one row per sample.

    Returns:
        A tuple ``(w, b)`` holding the slope and the intercept.

    Raises:
        ValueError: If there are fewer than two rows, or if every x value
            is the same. In both cases the denominator of ``w`` is zero
            and no unique line exists.
    """
    M = len(points)
    if M < 2:
        raise ValueError("fit() needs at least two data points")

    xs = points[:, 0]
    ys = points[:, 1]

    # Compare min and max instead of testing the computed denominator:
    # floating-point rounding can leave that denominator tiny but non-zero
    # even when all x values are equal, which would yield a meaningless slope.
    if xs.min() == xs.max():
        raise ValueError("fit() needs at least two distinct x values")

    x_bar = average(xs)

    sum_yx, sum_x2 = 0, 0
    for x, y in zip(xs, ys):
        sum_yx += y * (x - x_bar)
        sum_x2 += x ** 2

    # Slope from the closed-form formula.
    w = sum_yx / (sum_x2 - M * (x_bar ** 2))

    # Intercept: the mean of what is left of y after removing w * x.
    sum_delta = 0
    for x, y in zip(xs, ys):
        sum_delta += y - w * x
    b = sum_delta / M

    return w, b
        

4.测试

# Fit the line to the loaded data.
w, b = fit(points)

# Evaluate the fitted line with the loss function.
cost = computerCost(w, b, points)

# Report the fitted parameters and the resulting cost.
print('W is : ', w)
print('B is : ', b)
print('Cost : ', cost)
W is :  1.3224310227553846
B is :  7.991020982269173
Cost :  110.25738346621313

5.画出拟合曲线

# Draw the scatter plot of the raw data again.
plt.scatter(x, y)

# For every x, compute the predicted y value.
pred_y = w * x + b

# Overlay the fitted line in red and display the figure.
plt.plot(x, pred_y, c='r')
plt.show()

output_18_0.png

使用 scikit-learn(sklearn)库来实现线性回归模型

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection

def load_data():
    """Read ``data.csv`` and split its first two columns into train/test sets.

    Column 0 is reshaped into a single-column 2-D array and used as the
    features; column 1 is kept 1-D and used as the targets.

    Returns:
        Whatever ``model_selection.train_test_split`` returns for a 25 %
        test share and ``random_state=0``; callers in this file unpack it
        as ``x_train, x_test, y_train, y_test``.
    """
    table = np.genfromtxt('data.csv', delimiter=',')
    features, targets = table[:, 0].reshape(-1, 1), table[:, 1]

    # A fixed random_state is passed so the split uses the same seed each run.
    split_options = {'test_size': 0.25, 'random_state': 0}
    return model_selection.train_test_split(features, targets, **split_options)
# Split the data once at module level.
# NOTE(review): no later code visible here reads these module-level names;
# LinearRegression receives its own split as arguments. Confirm before removing.
x_train, x_test, y_train, y_test = load_data()
def LinearRegression(*data):
    """Fit scikit-learn's linear regression, print metrics and plot the fit.

    Args:
        *data: Exactly four positional values, in the order
            ``x_train, x_test, y_train, y_test``.

    Prints the learned coefficient and intercept, the mean squared
    prediction error on the test split (labelled "Residual"), and the value
    returned by the model's ``score`` method on the test split. It then
    shows a scatter plot of all samples with the model's predictions drawn
    as a red line. Returns nothing.
    """
    x_train, x_test, y_train, y_test = data

    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)

    print('Coefficient: %s, Intercepts: %.2f' % (model.coef_, model.intercept_))
    test_errors = model.predict(x_test) - y_test
    print('Residual : %.2f' % np.mean(test_errors ** 2))
    print('Scores : %.2f' % model.score(x_test, y_test))

    # Plot the training and test samples together with the fitted line.
    all_x = np.r_[x_train, x_test]
    all_y = np.r_[y_train, y_test]
    plt.scatter(all_x, all_y)
    plt.plot(all_x, model.predict(all_x), 'r')
    plt.show()
# Load a train/test split and run the scikit-learn regression demo on it.
data = load_data()
LinearRegression(*data)
Coefficient: [1.33810372], Intercepts: 7.22
Residual : 137.15
Scores : 0.38

png

Last modification:September 27, 2019
如果觉得我的文章对你有用,请随意赞赏