1 年之前 · cef2d53f7d
--- a/prerequisite/linear/generate_data.py
+++ b/prerequisite/linear/generate_data.py
@@ -0,0 +1,56 @@
 
				+# -*- coding: UTF-8 -*-
			
 
				+"""
			
 
				+此脚本用于随机生成线性回归模型的训练数据
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+import os
			
 
				+
			
 
				+import matplotlib.pyplot as plt
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+
			
 
				+
			
 
				+def generate_data():
			
 
				+    """
			
 
				+    随机生成数据
			
 
				+    """
			
 
				+    # 规定随机数生成的种子
			
 
				+    np.random.seed(4889)
			
 
				+    # Python2和Python3的range并不兼容，所以使用list(range(10, 29))
			
 
				+    x = np.array([10] + list(range(10, 29)))
			
 
				+    error = np.round(np.random.randn(20), 2)
			
 
				+    y = x + error
			
 
				+    return pd.DataFrame({"x": x, "y": y})
			
 
				+
			
 
				+
			
 
				+def visualize_data(data):
			
 
				+    """
			
 
				+    数据可视化
			
 
				+    """
			
 
				+    # 创建一个图形框，在里面只画一幅图
			
 
				+    fig = plt.figure(figsize=(6, 6), dpi=80)
			
 
				+    ax = fig.add_subplot(111)
			
 
				+    # 设置坐标轴
			
 
				+    ax.set_xlabel("$x$")
			
 
				+    ax.set_xticks(range(10, 31, 5))
			
 
				+    ax.set_ylabel("$y$")
			
 
				+    ax.set_yticks(range(10, 31, 5))
			
 
				+    # 画点图，点的颜色为蓝色
			
 
				+    ax.scatter(data.x, data.y, color="b",
			
 
				+               label="$y = x + \epsilon$")
			
 
				+    plt.legend(shadow=True)
			
 
				+    # 展示上面所画的图片。图片将阻断程序的运行，直至所有的图片被关闭
			
 
				+    # 在Python shell里面，可以设置参数"block=False"，使阻断失效。
			
 
				+    plt.show()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    data = generate_data()
			
 
				+    home_path = os.path.dirname(os.path.abspath(__file__))
			
 
				+    # 存储数据，Windows下的存储路径与Linux并不相同
			
 
				+    if os.name == "nt":
			
 
				+        data.to_csv("%s\\simple_example.csv" % home_path, index=False)
			
 
				+    else:
			
 
				+        data.to_csv("%s/simple_example.csv" % home_path, index=False)
			
 
				+    visualize_data(data)
			
--- a/prerequisite/linear/linear_ml.py
+++ b/prerequisite/linear/linear_ml.py
@@ -0,0 +1,127 @@
 
				+# -*- coding: UTF-8 -*-
			
 
				+"""
			
 
				+此脚本用于实现线性回归模型
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+import os
			
 
				+
			
 
				+import numpy as np
			
 
				+import matplotlib.pyplot as plt
			
 
				+import pandas as pd
			
 
				+from sklearn import linear_model
			
 
				+
			
 
				+
			
 
				+def read_data(path):
			
 
				+    """
			
 
				+    使用pandas读取数据
			
 
				+    
			
 
				+    参数
			
 
				+    ----
			
 
				+    path: String，数据的路径
			
 
				+    
			
 
				+    返回
			
 
				+    ----
			
 
				+    data: DataFrame，建模数据
			
 
				+    """
			
 
				+    data = pd.read_csv(path)
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+def train_model(x, y):
			
 
				+    """
			
 
				+    利用训练数据，估计模型参数
			
 
				+    
			
 
				+    参数
			
 
				+    ----
			
 
				+    x: DataFrame，特征
			
 
				+    
			
 
				+    y: DataFrame，标签
			
 
				+    
			
 
				+    返回
			
 
				+    ----
			
 
				+    model : LinearRegression, 训练好的线性模型
			
 
				+    """
			
 
				+    # 创建一个线性回归模型
			
 
				+    model = linear_model.LinearRegression()
			
 
				+    # 训练模型，估计模型参数
			
 
				+    model.fit(x, y)
			
 
				+    return model
			
 
				+
			
 
				+
			
 
				+def evaluate_model(model, x, y):
			
 
				+    """
			
 
				+    计算线性模型的均方差和决定系数
			
 
				+    
			
 
				+    参数
			
 
				+    ----
			
 
				+    model : LinearRegression, 训练完成的线性模型
			
 
				+    
			
 
				+    x: DataFrame，特征
			
 
				+    
			
 
				+    y: DataFrame，标签
			
 
				+    
			
 
				+    返回
			
 
				+    ----
			
 
				+    mse : np.float64，均方差
			
 
				+    
			
 
				+    score : np.float64，决定系数
			
 
				+    """
			
 
				+    # 均方差(The mean squared error)，均方差越小越好
			
 
				+    mse = np.mean(
			
 
				+        (model.predict(x) - y) ** 2)
			
 
				+    # 决定系数(Coefficient of determination)，决定系数越接近1越好
			
 
				+    score = model.score(x, y)
			
 
				+    return mse, score
			
 
				+
			
 
				+
			
 
				+def visualize_model(model, x, y):
			
 
				+    """
			
 
				+    模型可视化
			
 
				+    """
			
 
				+    # 创建一个图形框
			
 
				+    fig = plt.figure(figsize=(6, 6), dpi=80)
			
 
				+    # 在图形框里只画一幅图
			
 
				+    ax = fig.add_subplot(111)
			
 
				+    ax.set_xlabel('$x$')
			
 
				+    ax.set_ylabel('$y$')
			
 
				+    # 画点图，用蓝色圆点表示原始数据
			
 
				+    ax.scatter(x, y, color='b')
			
 
				+    # 根据截距的正负，打印不同的标签
			
 
				+    ax.plot(x, model.predict(x), color='r',
			
 
				+            label=u'$y = %.3fx$ + %.3f' % (model.coef_, model.intercept_))
			
 
				+    plt.legend(shadow=True)
			
 
				+    # 展示上面所画的图片。图片将阻断程序的运行，直至所有的图片被关闭
			
 
				+    # 在Python shell里面，可以设置参数"block=False"，使阻断失效。
			
 
				+    plt.show()
			
 
				+
			
 
				+
			
 
				+def run_model(data):
			
 
				+    """
			
 
				+    线性回归模型建模步骤展示
			
 
				+
			
 
				+    参数
			
 
				+    ----
			
 
				+    data : DataFrame，建模数据
			
 
				+    """
			
 
				+    features = ["x"]
			
 
				+    label = ["y"]
			
 
				+    # 产生并训练模型
			
 
				+    model = train_model(data[features], data[label])
			
 
				+    # 评价模型效果
			
 
				+    mse, score = evaluate_model(model, data[features], data[label])
			
 
				+    print("MSE is %f" % mse)
			
 
				+    print("R2 is %f" % score)
			
 
				+    # 图形化模型结果
			
 
				+    visualize_model(model, data[features], data[label])
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    home_path = os.path.dirname(os.path.abspath(__file__))
			
 
				+    # Windows下的存储路径与Linux并不相同
			
 
				+    if os.name == "nt":
			
 
				+        data_path = "%s\\simple_example.csv" % home_path
			
 
				+    else:
			
 
				+        data_path = "%s/simple_example.csv" % home_path
			
 
				+    data = read_data(data_path)
			
 
				+    run_model(data)
			
--- a/prerequisite/linear/ols_vs_lad.py
+++ b/prerequisite/linear/ols_vs_lad.py
@@ -0,0 +1,89 @@
 
				+# -*- coding: UTF-8 -*-
			
 
				+"""
			
 
				+此脚本用于比较LAD线性回归和OLS线性回归
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+import statsmodels.api as sm
			
 
				+from sklearn import linear_model
			
 
				+from statsmodels.regression.quantile_regression import QuantReg
			
 
				+import matplotlib.pyplot as plt
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+
			
 
				+
			
 
				+def generate_data():
			
 
				+    """
			
 
				+    随机生成数据
			
 
				+    """
			
 
				+    np.random.seed(4889)
			
 
				+    # Python2和Python3的range并不兼容，所以使用list(range(10, 29))
			
 
				+    x = np.array([10] + list(range(10, 29)))
			
 
				+    error = np.round(np.random.randn(20), 2)
			
 
				+    y = x + error
			
 
				+    # 增加异常点
			
 
				+    x = np.append(x, 29)
			
 
				+    y = np.append(y, 29 * 10)
			
 
				+    return pd.DataFrame({"x": x, "y": y})
			
 
				+
			
 
				+
			
 
				+def train_OLS(x, y):
			
 
				+    """
			
 
				+    训练OLS线性回归模型，并返回模型预测值
			
 
				+    """
			
 
				+    model = linear_model.LinearRegression()
			
 
				+    model.fit(x, y)
			
 
				+    re = model.predict(x)
			
 
				+    return re
			
 
				+
			
 
				+
			
 
				+def train_LAD(x, y):
			
 
				+    """
			
 
				+    训练LAD线性回归模型，并返回模型预测值
			
 
				+    """
			
 
				+    X = sm.add_constant(x)
			
 
				+    model = QuantReg(y, X)
			
 
				+    model = model.fit(q=0.5)
			
 
				+    re = model.predict(X)
			
 
				+    return re
			
 
				+    
			
 
				+    
			
 
				+def visualize_model(x, y, ols, lad):
			
 
				+    """
			
 
				+    模型结果可视化
			
 
				+    """
			
 
				+    # 创建一个图形框
			
 
				+    fig = plt.figure(figsize=(6, 6), dpi=80)
			
 
				+    # 在图形框里只画一幅图
			
 
				+    ax = fig.add_subplot(111)
			
 
				+    # 设置坐标轴
			
 
				+    ax.set_xlabel("$x$")
			
 
				+    ax.set_xticks(range(10, 31, 5))
			
 
				+    ax.set_ylabel("$y$")
			
 
				+    # 画点图，点的颜色为蓝色，半透明
			
 
				+    ax.scatter(x, y, color="b", alpha=0.4)
			
 
				+    # 将模型结果可视化出来
			
 
				+    # 用红色虚线表示OLS线性回归模型的结果
			
 
				+    ax.plot(x, ols, 'r--', label="OLS")
			
 
				+    # 用黑色实线表示LAD线性回归模型的结果
			
 
				+    ax.plot(x, lad, 'k', label="LAD")
			
 
				+    plt.legend(shadow=True)
			
 
				+    # 展示上面所画的图片。图片将阻断程序的运行，直至所有的图片被关闭
			
 
				+    # 在Python shell里面，可以设置参数"block=False"，使阻断失效
			
 
				+    plt.show()
			
 
				+
			
 
				+
			
 
				+def OLS_vs_LAD(data):
			
 
				+    """
			
 
				+    比较OLS模型和LAD模型的差异
			
 
				+    """
			
 
				+    features = ["x"]
			
 
				+    label = ["y"]
			
 
				+    ols = train_OLS(data[features], data[label])
			
 
				+    lad = train_LAD(data[features], data[label])
			
 
				+    visualize_model(data[features], data[label], ols, lad)
			
 
				+
			
 
				+    
			
 
				+if __name__ == "__main__":
			
 
				+    data = generate_data()
			
 
				+    OLS_vs_LAD(data)
			
--- a/prerequisite/linear/pdf/1_机器学习眼中的线性回归模型.pdf
+++ b/prerequisite/linear/pdf/1_机器学习眼中的线性回归模型.pdf
--- a/prerequisite/linear/simple_example.csv
+++ b/prerequisite/linear/simple_example.csv
@@ -0,0 +1,21 @@
 
				+x,y
			
 
				+10,7.7
			
 
				+10,9.87
			
 
				+11,11.18
			
 
				+12,10.43
			
 
				+13,12.36
			
 
				+14,14.15
			
 
				+15,15.73
			
 
				+16,16.4
			
 
				+17,18.86
			
 
				+18,16.13
			
 
				+19,18.21
			
 
				+20,18.37
			
 
				+21,22.61
			
 
				+22,19.83
			
 
				+23,22.67
			
 
				+24,22.7
			
 
				+25,25.16
			
 
				+26,25.55
			
 
				+27,28.21
			
 
				+28,28.12
			
--- a/prerequisite/math/pdf/1_矩阵和向量空间：数据的基本表示形式.pdf
+++ b/prerequisite/math/pdf/1_矩阵和向量空间：数据的基本表示形式.pdf
--- a/prerequisite/math/pdf/2_概率：量化随机.pdf
+++ b/prerequisite/math/pdf/2_概率：量化随机.pdf
--- a/prerequisite/math/pdf/3_微积分：变化速率与累积效应.pdf
+++ b/prerequisite/math/pdf/3_微积分：变化速率与累积效应.pdf
--- a/prerequisite/math/pdf/4_如何量化信息的价值：条件概率.pdf
+++ b/prerequisite/math/pdf/4_如何量化信息的价值：条件概率.pdf