Bläddra i källkod

update comment for ch03

Gen TANG 2 år sedan
förälder
incheckning
1e42e76ad4

+ 10 - 7
ch03_linear/linear_illusion_ci.ipynb

@@ -36,6 +36,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 构建并训练模型\n",
     "model = sm.OLS(Y, X)\n",
     "re = model.fit()"
    ]
@@ -54,8 +55,8 @@
       "Dep. Variable:                      y   R-squared:                       0.963\n",
       "Model:                            OLS   Adj. R-squared:                  0.959\n",
       "Method:                 Least Squares   F-statistic:                     222.8\n",
-      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):           6.38e-13\n",
-      "Time:                        15:52:54   Log-Likelihood:                -31.141\n",
+      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):           6.38e-13\n",
+      "Time:                        10:39:38   Log-Likelihood:                -31.141\n",
       "No. Observations:                  20   AIC:                             68.28\n",
       "Df Residuals:                      17   BIC:                             71.27\n",
       "Df Model:                           2                                         \n",
@@ -89,6 +90,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# 生成模型数据\n",
     "np.random.seed(5320)\n",
     "x = np.array(range(0, 20)) / 2\n",
     "error = np.round(np.random.randn(20), 2)\n",
@@ -112,8 +114,8 @@
       "Dep. Variable:                      y   R-squared (uncentered):                   0.204\n",
       "Model:                            OLS   Adj. R-squared (uncentered):              0.162\n",
       "Method:                 Least Squares   F-statistic:                              4.878\n",
-      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):                      0.0397\n",
-      "Time:                        15:52:54   Log-Likelihood:                         -29.583\n",
+      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):                      0.0397\n",
+      "Time:                        10:39:38   Log-Likelihood:                         -29.583\n",
       "No. Observations:                  20   AIC:                                      61.17\n",
       "Df Residuals:                      19   BIC:                                      62.16\n",
       "Df Model:                           1                                                  \n",
@@ -136,7 +138,7 @@
     }
    ],
    "source": [
-    "# 没有多余变量时,x系数符号估计正确,为正\n",
+    "# 没有多余变量时,x系数符号估计正确,为正\n",
     "model = sm.OLS(data[['y']], data[['x']])\n",
     "re = model.fit()\n",
     "print(re.summary())"
@@ -156,8 +158,8 @@
       "Dep. Variable:                      y   R-squared:                       0.005\n",
       "Model:                            OLS   Adj. R-squared:                 -0.050\n",
       "Method:                 Least Squares   F-statistic:                   0.09171\n",
-      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):              0.765\n",
-      "Time:                        15:52:54   Log-Likelihood:                -27.982\n",
+      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):              0.765\n",
+      "Time:                        10:39:38   Log-Likelihood:                -27.982\n",
       "No. Observations:                  20   AIC:                             59.96\n",
       "Df Residuals:                      18   BIC:                             61.96\n",
       "Df Model:                           1                                         \n",
@@ -180,6 +182,7 @@
     }
    ],
    "source": [
+    "# 加入多余变量时,x系数符号估计错误,为负数\n",
     "model1 = sm.OLS(data[['y']], data[['x', 'z']])\n",
     "re1 = model1.fit()\n",
     "print(re1.summary())"

+ 12 - 4
ch03_linear/linear_illusion_reg.ipynb

@@ -3,7 +3,9 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -20,7 +22,9 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "data = pd.read_csv('./data/simple_example.csv')\n",
@@ -34,13 +38,16 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "alphas = np.linspace(-6, -1, 100)\n",
     "coefs = []\n",
     "for alpha in alphas:\n",
-    "    # 在scikit-learn的实现中,惩罚并不包括截距项,因此手动添加\n",
+    "    # 如果直接使用scikit-learn提供的lasso模型,则模型惩罚并不包括截距项。\n",
+    "    # 因此我们在模型中手动添加截距项const\n",
     "    model = linear_model.Lasso(alpha=np.exp(alpha), fit_intercept=False)\n",
     "    model.fit(X[['x', 'z', 'const']], Y)\n",
     "    coefs.append(model.coef_.tolist())\n",
@@ -77,6 +84,7 @@
     "ax.plot(alphas, coefs[:, 0], 'b-.', label='x的参数a')\n",
     "ax.plot(alphas, coefs[:, 1], 'r:', label='z的参数b')\n",
     "ax.plot(alphas, coefs[:, 2], 'g', label='const的参数c')\n",
+    "# 设置图例的样式\n",
     "legend = plt.legend(loc=4, shadow=True)\n",
     "legend.get_frame().set_facecolor(\"#6F93AE\")\n",
     "ax.set_yticks(np.arange(-1.0, 1.3, 0.2))\n",

+ 12 - 3
ch03_linear/linear_ml.ipynb

@@ -3,16 +3,21 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
+    "# 安装第三方库\n",
     "!pip install scikit-learn pandas matplotlib numpy"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -187,6 +192,7 @@
     }
    ],
    "source": [
+    "# 读取并展示数据\n",
     "data = pd.read_csv('./data/simple_example.csv')\n",
     "data"
    ]
@@ -194,7 +200,9 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "features = ['x']\n",
@@ -294,6 +302,7 @@
     "    label = f'预测值: $y = {model.coef_.item():.3f}x$ - {abs(model.intercept_.item()):.3f}'\n",
     "# 画线图,用红色线条表示模型结果\n",
     "ax.plot(data[features], model.predict(data[features]), color='r', label=label)\n",
+    "# 设置图例的样式\n",
     "legend = plt.legend(shadow=True)\n",
     "legend.get_frame().set_facecolor('#6F93AE')\n",
     "# 显示均方差和决定系数\n",

+ 24 - 14
ch03_linear/linear_overfitting.ipynb

@@ -3,7 +3,9 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -17,7 +19,9 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "data = pd.read_csv('./data/simple_example.csv')\n",
@@ -31,11 +35,13 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def evaluate_model(model, test_data, features, labels, featurizer):\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    计算线性模型的均方差和决定系数\n",
     "\n",
     "    参数\n",
@@ -53,7 +59,7 @@
     "    error : np.float64,均方差\n",
     "\n",
     "    score : np.float64,决定系数\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    # 均方差(Mean Squared Error),均方差越小越好\n",
     "    error = model.predict(featurizer.fit_transform(test_data[features])) - test_data[labels]\n",
     "    mse = np.mean(error.values ** 2)\n",
@@ -65,11 +71,13 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def train_model(train_data, features, labels, featurizer):\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    利用训练数据,估计模型参数\n",
     "\n",
     "    参数\n",
@@ -83,7 +91,7 @@
     "    返回\n",
     "    ----\n",
     "    model : LinearRegression, 训练好的线性模型\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    # 创建一个线性回归模型\n",
     "    model = linear_model.LinearRegression(fit_intercept=False)\n",
     "    # 训练模型,估计模型参数\n",
@@ -94,13 +102,15 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def visualize_model(model, featurizer, data, features, labels, evaluation):\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    模型可视化\n",
-    "    \"\"\"\n",
+    "    '''\n",
     "    # 为在Matplotlib中显示中文,设置特殊字体\n",
     "    plt.rcParams['font.sans-serif'] = ['SimHei']\n",
     "    plt.rcParams['axes.unicode_minus'] = False\n",
@@ -114,12 +124,12 @@
     "    return plt\n",
     "\n",
     "def _visualization(ax, data, model, featurizer, evaluation, features, labels):\n",
-    "    \"\"\"\n",
-    "    \"\"\"\n",
+    "    '''\n",
+    "    '''\n",
     "    # 画点图,用蓝色圆点表示原始数据\n",
     "    ax.scatter(data[features], data[labels], color='b')\n",
     "    # 画线图,用红色线条表示模型结果\n",
-    "    ax.plot(data[features], model.predict(featurizer.fit_transform(data[features])), color=\"r\")\n",
+    "    ax.plot(data[features], model.predict(featurizer.fit_transform(data[features])), color='r')\n",
     "    # 显示均方差和决定系数\n",
     "    ax.text(0.01, 0.99, f'均方差:{evaluation[0]:.3f}\\n决定系数:{evaluation[1]:.3f}',\n",
     "            style='italic', verticalalignment='top', horizontalalignment='left',\n",

+ 11 - 4
ch03_linear/linear_stat.ipynb

@@ -3,16 +3,21 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
+    "# 安装第三方库\n",
     "!pip install statsmodels"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -188,6 +193,7 @@
     }
    ],
    "source": [
+    "# 读取并展示数据\n",
     "data = pd.read_csv('./data/simple_example.csv')\n",
     "data"
    ]
@@ -275,7 +281,7 @@
     }
    ],
    "source": [
-    "# 构建模型\n",
+    "# 构建并训练模型\n",
     "model = sm.OLS(Y, X)\n",
     "re = model.fit()\n",
     "re"
@@ -422,7 +428,7 @@
    ],
    "source": [
     "# const并不显著,去掉这个常量变量\n",
-    "# 构建模型\n",
+    "# 构建新的模型\n",
     "model_new = sm.OLS(Y, data[features])\n",
     "re_new = model_new.fit()\n",
     "# 输出新模型的分析结果\n",
@@ -468,6 +474,7 @@
     "ax.plot(data[features], re_new.predict(data[features]), color='r',\n",
     "        label=f'预测值: $y = {re_new.params[features].item():.3f}x$')\n",
     "ax.plot(data[features], pre_low, 'r--')\n",
+    "# 设置图例的样式\n",
     "legend = plt.legend(shadow=True)\n",
     "legend.get_frame().set_facecolor('#6F93AE')\n",
     "plt.savefig('linear_stat.png', dpi=200)\n",