2 år sedan · 1e42e76ad4
--- a/ch03_linear/linear_illusion_ci.ipynb
+++ b/ch03_linear/linear_illusion_ci.ipynb
@@ -36,6 +36,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				+    "# 构建并训练模型\n",
			
 
				     "model = sm.OLS(Y, X)\n",
			
 
				     "re = model.fit()"
			
 
				    ]
			
@@ -54,8 +55,8 @@
 
				       "Dep. Variable:                      y   R-squared:                       0.963\n",
			
 
				       "Model:                            OLS   Adj. R-squared:                  0.959\n",
			
 
				       "Method:                 Least Squares   F-statistic:                     222.8\n",
			
 
				-      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):           6.38e-13\n",
			
 
				-      "Time:                        15:52:54   Log-Likelihood:                -31.141\n",
			
 
				+      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):           6.38e-13\n",
			
 
				+      "Time:                        10:39:38   Log-Likelihood:                -31.141\n",
			
 
				       "No. Observations:                  20   AIC:                             68.28\n",
			
 
				       "Df Residuals:                      17   BIC:                             71.27\n",
			
 
				       "Df Model:                           2                                         \n",
			
@@ -89,6 +90,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				+    "# 生成模型数据\n",
			
 
				     "np.random.seed(5320)\n",
			
 
				     "x = np.array(range(0, 20)) / 2\n",
			
 
				     "error = np.round(np.random.randn(20), 2)\n",
			
@@ -112,8 +114,8 @@
 
				       "Dep. Variable:                      y   R-squared (uncentered):                   0.204\n",
			
 
				       "Model:                            OLS   Adj. R-squared (uncentered):              0.162\n",
			
 
				       "Method:                 Least Squares   F-statistic:                              4.878\n",
			
 
				-      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):                      0.0397\n",
			
 
				-      "Time:                        15:52:54   Log-Likelihood:                         -29.583\n",
			
 
				+      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):                      0.0397\n",
			
 
				+      "Time:                        10:39:38   Log-Likelihood:                         -29.583\n",
			
 
				       "No. Observations:                  20   AIC:                                      61.17\n",
			
 
				       "Df Residuals:                      19   BIC:                                      62.16\n",
			
 
				       "Df Model:                           1                                                  \n",
			
@@ -136,7 +138,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "# 没有多余变量时，x系数符号估计正确，为正\n",
			
 
				+    "# 没有多余变量时，x系数符号估计正确，为正数\n",
			
 
				     "model = sm.OLS(data[['y']], data[['x']])\n",
			
 
				     "re = model.fit()\n",
			
 
				     "print(re.summary())"
			
@@ -156,8 +158,8 @@
 
				       "Dep. Variable:                      y   R-squared:                       0.005\n",
			
 
				       "Model:                            OLS   Adj. R-squared:                 -0.050\n",
			
 
				       "Method:                 Least Squares   F-statistic:                   0.09171\n",
			
 
				-      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):              0.765\n",
			
 
				-      "Time:                        15:52:54   Log-Likelihood:                -27.982\n",
			
 
				+      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):              0.765\n",
			
 
				+      "Time:                        10:39:38   Log-Likelihood:                -27.982\n",
			
 
				       "No. Observations:                  20   AIC:                             59.96\n",
			
 
				       "Df Residuals:                      18   BIC:                             61.96\n",
			
 
				       "Df Model:                           1                                         \n",
			
@@ -180,6 +182,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				+    "# 加入多余变量时，x系数符号估计错误，为负数\n",
			
 
				     "model1 = sm.OLS(data[['y']], data[['x', 'z']])\n",
			
 
				     "re1 = model1.fit()\n",
			
 
				     "print(re1.summary())"
			
--- a/ch03_linear/linear_illusion_reg.ipynb
+++ b/ch03_linear/linear_illusion_reg.ipynb
@@ -3,7 +3,9 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 1,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "import numpy as np\n",
			
@@ -20,7 +22,9 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 2,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "data = pd.read_csv('./data/simple_example.csv')\n",
			
@@ -34,13 +38,16 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 3,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "alphas = np.linspace(-6, -1, 100)\n",
			
 
				     "coefs = []\n",
			
 
				     "for alpha in alphas:\n",
			
 
				-    "    # 在scikit-learn的实现中，惩罚并不包括截距项，因此手动添加\n",
			
 
				+    "    # 如果直接使用scikit-learn提供的lasso模型，则模型惩罚并不包括截距项。\n",
			
 
				+    "    # 因此我们在模型中手动添加截距项const\n",
			
 
				     "    model = linear_model.Lasso(alpha=np.exp(alpha), fit_intercept=False)\n",
			
 
				     "    model.fit(X[['x', 'z', 'const']], Y)\n",
			
 
				     "    coefs.append(model.coef_.tolist())\n",
			
@@ -77,6 +84,7 @@
 
				     "ax.plot(alphas, coefs[:, 0], 'b-.', label='x的参数a')\n",
			
 
				     "ax.plot(alphas, coefs[:, 1], 'r:', label='z的参数b')\n",
			
 
				     "ax.plot(alphas, coefs[:, 2], 'g', label='const的参数c')\n",
			
 
				+    "# 设置图例的样式\n",
			
 
				     "legend = plt.legend(loc=4, shadow=True)\n",
			
 
				     "legend.get_frame().set_facecolor(\"#6F93AE\")\n",
			
 
				     "ax.set_yticks(np.arange(-1.0, 1.3, 0.2))\n",
			
--- a/ch03_linear/linear_ml.ipynb
+++ b/ch03_linear/linear_ml.ipynb
@@ -3,16 +3,21 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				+    "# 安装第三方库\n",
			
 
				     "!pip install scikit-learn pandas matplotlib numpy"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 2,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "import numpy as np\n",
			
@@ -187,6 +192,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				+    "# 读取并展示数据\n",
			
 
				     "data = pd.read_csv('./data/simple_example.csv')\n",
			
 
				     "data"
			
 
				    ]
			
@@ -194,7 +200,9 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 4,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "features = ['x']\n",
			
@@ -294,6 +302,7 @@
 
				     "    label = f'预测值: $y = {model.coef_.item():.3f}x$ - {abs(model.intercept_.item()):.3f}'\n",
			
 
				     "# 画线图，用红色线条表示模型结果\n",
			
 
				     "ax.plot(data[features], model.predict(data[features]), color='r', label=label)\n",
			
 
				+    "# 设置图例的样式\n",
			
 
				     "legend = plt.legend(shadow=True)\n",
			
 
				     "legend.get_frame().set_facecolor('#6F93AE')\n",
			
 
				     "# 显示均方差和决定系数\n",
			
--- a/ch03_linear/linear_overfitting.ipynb
+++ b/ch03_linear/linear_overfitting.ipynb
@@ -3,7 +3,9 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 1,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "import numpy as np\n",
			
@@ -17,7 +19,9 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 2,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "data = pd.read_csv('./data/simple_example.csv')\n",
			
@@ -31,11 +35,13 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 3,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "def evaluate_model(model, test_data, features, labels, featurizer):\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				     "    计算线性模型的均方差和决定系数\n",
			
 
				     "\n",
			
 
				     "    参数\n",
			
@@ -53,7 +59,7 @@
 
				     "    error : np.float64，均方差\n",
			
 
				     "\n",
			
 
				     "    score : np.float64，决定系数\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				     "    # 均方差(Mean Squared Error)，均方差越小越好\n",
			
 
				     "    error = model.predict(featurizer.fit_transform(test_data[features])) - test_data[labels]\n",
			
 
				     "    mse = np.mean(error.values ** 2)\n",
			
@@ -65,11 +71,13 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 4,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "def train_model(train_data, features, labels, featurizer):\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				     "    利用训练数据，估计模型参数\n",
			
 
				     "\n",
			
 
				     "    参数\n",
			
@@ -83,7 +91,7 @@
 
				     "    返回\n",
			
 
				     "    ----\n",
			
 
				     "    model : LinearRegression, 训练好的线性模型\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				     "    # 创建一个线性回归模型\n",
			
 
				     "    model = linear_model.LinearRegression(fit_intercept=False)\n",
			
 
				     "    # 训练模型，估计模型参数\n",
			
@@ -94,13 +102,15 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 5,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "def visualize_model(model, featurizer, data, features, labels, evaluation):\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				     "    模型可视化\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				     "    # 为在Matplotlib中显示中文，设置特殊字体\n",
			
 
				     "    plt.rcParams['font.sans-serif'] = ['SimHei']\n",
			
 
				     "    plt.rcParams['axes.unicode_minus'] = False\n",
			
@@ -114,12 +124,12 @@
 
				     "    return plt\n",
			
 
				     "\n",
			
 
				     "def _visualization(ax, data, model, featurizer, evaluation, features, labels):\n",
			
 
				-    "    \"\"\"\n",
			
 
				-    "    \"\"\"\n",
			
 
				+    "    '''\n",
			
 
				+    "    '''\n",
			
 
				     "    # 画点图，用蓝色圆点表示原始数据\n",
			
 
				     "    ax.scatter(data[features], data[labels], color='b')\n",
			
 
				     "    # 画线图，用红色线条表示模型结果\n",
			
 
				-    "    ax.plot(data[features], model.predict(featurizer.fit_transform(data[features])), color=\"r\")\n",
			
 
				+    "    ax.plot(data[features], model.predict(featurizer.fit_transform(data[features])), color='r')\n",
			
 
				     "    # 显示均方差和决定系数\n",
			
 
				     "    ax.text(0.01, 0.99, f'均方差：{evaluation[0]:.3f}\\n决定系数：{evaluation[1]:.3f}',\n",
			
 
				     "            style='italic', verticalalignment='top', horizontalalignment='left',\n",
			
--- a/ch03_linear/linear_stat.ipynb
+++ b/ch03_linear/linear_stat.ipynb
@@ -3,16 +3,21 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				+    "# 安装第三方库\n",
			
 
				     "!pip install statsmodels"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 2,
			
 
				-   "metadata": {},
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "import numpy as np\n",
			
@@ -188,6 +193,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				+    "# 读取并展示数据\n",
			
 
				     "data = pd.read_csv('./data/simple_example.csv')\n",
			
 
				     "data"
			
 
				    ]
			
@@ -275,7 +281,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "# 构建模型\n",
			
 
				+    "# 构建并训练模型\n",
			
 
				     "model = sm.OLS(Y, X)\n",
			
 
				     "re = model.fit()\n",
			
 
				     "re"
			
@@ -422,7 +428,7 @@
 
				    ],
			
 
				    "source": [
			
 
				     "# const并不显著，去掉这个常量变量\n",
			
 
				-    "# 构建模型\n",
			
 
				+    "# 构建新的模型\n",
			
 
				     "model_new = sm.OLS(Y, data[features])\n",
			
 
				     "re_new = model_new.fit()\n",
			
 
				     "# 输出新模型的分析结果\n",
			
@@ -468,6 +474,7 @@
 
				     "ax.plot(data[features], re_new.predict(data[features]), color='r',\n",
			
 
				     "        label=f'预测值: $y = {re_new.params[features].item():.3f}x$')\n",
			
 
				     "ax.plot(data[features], pre_low, 'r--')\n",
			
 
				+    "# 设置图例的样式\n",
			
 
				     "legend = plt.legend(shadow=True)\n",
			
 
				     "legend.get_frame().set_facecolor('#6F93AE')\n",
			
 
				     "plt.savefig('linear_stat.png', dpi=200)\n",