Jelajahi Sumber

ch03 finished

Gen TANG 2 tahun lalu
induk
melakukan
a40ef02bd4

+ 21 - 0
ch03_linear/data/simple_example.csv

@@ -0,0 +1,21 @@
+x,y
+10,7.7
+10,9.87
+11,11.18
+12,10.43
+13,12.36
+14,14.15
+15,15.73
+16,16.4
+17,18.86
+18,16.13
+19,18.21
+20,18.37
+21,22.61
+22,19.83
+23,22.67
+24,22.7
+25,25.16
+26,25.55
+27,28.21
+28,28.12

+ 210 - 0
ch03_linear/linear_illusion_ci.ipynb

@@ -0,0 +1,210 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import statsmodels.api as sm\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "\n",
+    "\n",
+    "np.random.seed(4873)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv('./data/simple_example.csv')\n",
+    "Y = data[['y']]\n",
+    "X = data[['x']]\n",
+    "# 加入新的随机变量,此变量的系数应为0\n",
+    "X['z'] = np.random.randint(2, size=20)\n",
+    "# 加入常量变量\n",
+    "X = sm.add_constant(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = sm.OLS(Y, X)\n",
+    "re = model.fit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                            OLS Regression Results                            \n",
+      "==============================================================================\n",
+      "Dep. Variable:                      y   R-squared:                       0.963\n",
+      "Model:                            OLS   Adj. R-squared:                  0.959\n",
+      "Method:                 Least Squares   F-statistic:                     222.8\n",
+      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):           6.38e-13\n",
+      "Time:                        15:52:54   Log-Likelihood:                -31.141\n",
+      "No. Observations:                  20   AIC:                             68.28\n",
+      "Df Residuals:                      17   BIC:                             71.27\n",
+      "Df Model:                           2                                         \n",
+      "Covariance Type:            nonrobust                                         \n",
+      "==============================================================================\n",
+      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
+      "------------------------------------------------------------------------------\n",
+      "const         -0.8983      0.953     -0.942      0.359      -2.910       1.113\n",
+      "x              1.0400      0.050     20.722      0.000       0.934       1.146\n",
+      "z             -0.3619      0.571     -0.634      0.535      -1.566       0.843\n",
+      "==============================================================================\n",
+      "Omnibus:                        1.295   Durbin-Watson:                   2.209\n",
+      "Prob(Omnibus):                  0.523   Jarque-Bera (JB):                0.852\n",
+      "Skew:                          -0.061   Prob(JB):                        0.653\n",
+      "Kurtosis:                       1.996   Cond. No.                         66.7\n",
+      "==============================================================================\n",
+      "\n",
+      "Notes:\n",
+      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 整体统计分析结果\n",
+    "print(re.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.random.seed(5320)\n",
+    "x = np.array(range(0, 20)) / 2\n",
+    "error = np.round(np.random.randn(20), 2)\n",
+    "y = 0.05 * x + error\n",
+    "# 新加入的无关变量z恒等于1\n",
+    "z = np.zeros(20) + 1\n",
+    "data = pd.DataFrame({\"x\": x, \"z\": z, \"y\": y})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                 OLS Regression Results                                \n",
+      "=======================================================================================\n",
+      "Dep. Variable:                      y   R-squared (uncentered):                   0.204\n",
+      "Model:                            OLS   Adj. R-squared (uncentered):              0.162\n",
+      "Method:                 Least Squares   F-statistic:                              4.878\n",
+      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):                      0.0397\n",
+      "Time:                        15:52:54   Log-Likelihood:                         -29.583\n",
+      "No. Observations:                  20   AIC:                                      61.17\n",
+      "Df Residuals:                      19   BIC:                                      62.16\n",
+      "Df Model:                           1                                                  \n",
+      "Covariance Type:            nonrobust                                                  \n",
+      "==============================================================================\n",
+      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
+      "------------------------------------------------------------------------------\n",
+      "x              0.0969      0.044      2.209      0.040       0.005       0.189\n",
+      "==============================================================================\n",
+      "Omnibus:                        0.871   Durbin-Watson:                   2.037\n",
+      "Prob(Omnibus):                  0.647   Jarque-Bera (JB):                0.815\n",
+      "Skew:                           0.275   Prob(JB):                        0.665\n",
+      "Kurtosis:                       2.179   Cond. No.                         1.00\n",
+      "==============================================================================\n",
+      "\n",
+      "Notes:\n",
+      "[1] R² is computed without centering (uncentered) since the model does not contain a constant.\n",
+      "[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 没有多余变量时,x系数符号估计正确,为正\n",
+    "model = sm.OLS(data[['y']], data[['x']])\n",
+    "re = model.fit()\n",
+    "print(re.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                            OLS Regression Results                            \n",
+      "==============================================================================\n",
+      "Dep. Variable:                      y   R-squared:                       0.005\n",
+      "Model:                            OLS   Adj. R-squared:                 -0.050\n",
+      "Method:                 Least Squares   F-statistic:                   0.09171\n",
+      "Date:                Thu, 02 Nov 2023   Prob (F-statistic):              0.765\n",
+      "Time:                        15:52:54   Log-Likelihood:                -27.982\n",
+      "No. Observations:                  20   AIC:                             59.96\n",
+      "Df Residuals:                      18   BIC:                             61.96\n",
+      "Df Model:                           1                                         \n",
+      "Covariance Type:            nonrobust                                         \n",
+      "==============================================================================\n",
+      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
+      "------------------------------------------------------------------------------\n",
+      "x             -0.0243      0.080     -0.303      0.765      -0.193       0.144\n",
+      "z              0.7873      0.445      1.768      0.094      -0.148       1.723\n",
+      "==============================================================================\n",
+      "Omnibus:                        0.939   Durbin-Watson:                   2.375\n",
+      "Prob(Omnibus):                  0.625   Jarque-Bera (JB):                0.886\n",
+      "Skew:                           0.338   Prob(JB):                        0.642\n",
+      "Kurtosis:                       2.221   Cond. No.                         11.0\n",
+      "==============================================================================\n",
+      "\n",
+      "Notes:\n",
+      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model1 = sm.OLS(data[['y']], data[['x', 'z']])\n",
+    "re1 = model1.fit()\n",
+    "print(re1.summary())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

File diff ditekan karena terlalu besar
+ 56 - 0
ch03_linear/linear_illusion_reg.ipynb


File diff ditekan karena terlalu besar
+ 214 - 0
ch03_linear/linear_ml.ipynb


File diff ditekan karena terlalu besar
+ 135 - 0
ch03_linear/linear_overfitting.ipynb


File diff ditekan karena terlalu besar
+ 438 - 0
ch03_linear/linear_stat.ipynb


Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini