{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# OLS regression with an irrelevant regressor\n",
    "\n",
    "Two small experiments with `statsmodels` OLS:\n",
    "\n",
    "1. Fit `y` on a constant, `x` and `z` using `./data/simple_example.csv`, where `z` is a random 0/1 column generated in this notebook.\n",
    "2. Simulate `y = 0.05 * x + error` and compare the fit on `x` alone with the fit on `x` plus a column `z` that is always 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import statsmodels.api as sm\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Seed NumPy's global RNG so the random column generated below is reproducible\n",
    "np.random.seed(4873)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('./data/simple_example.csv')\n",
    "Y = data[['y']]\n",
    "# Add a new random 0/1 variable z; its coefficient should be 0.\n",
    "# .assign() returns a new frame, so nothing is written into a slice of `data`,\n",
    "# and the column length follows the data instead of a hard-coded row count.\n",
    "# sm.add_constant() then adds the constant (intercept) column.\n",
    "X = sm.add_constant(\n",
    "    data[['x']].assign(z=np.random.randint(2, size=len(data)))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build and fit the model\n",
    "model = sm.OLS(Y, X)\n",
    "result = model.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " OLS Regression Results \n",
      "==============================================================================\n",
      "Dep. Variable: y R-squared: 0.963\n",
      "Model: OLS Adj. R-squared: 0.959\n",
      "Method: Least Squares F-statistic: 222.8\n",
      "Date: Fri, 08 Dec 2023 Prob (F-statistic): 6.38e-13\n",
      "Time: 10:39:38 Log-Likelihood: -31.141\n",
      "No. Observations: 20 AIC: 68.28\n",
      "Df Residuals: 17 BIC: 71.27\n",
      "Df Model: 2 \n",
      "Covariance Type: nonrobust \n",
      "==============================================================================\n",
      " coef std err t P>|t| [0.025 0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const -0.8983 0.953 -0.942 0.359 -2.910 1.113\n",
      "x 1.0400 0.050 20.722 0.000 0.934 1.146\n",
      "z -0.3619 0.571 -0.634 0.535 -1.566 0.843\n",
      "==============================================================================\n",
      "Omnibus: 1.295 Durbin-Watson: 2.209\n",
      "Prob(Omnibus): 0.523 Jarque-Bera (JB): 0.852\n",
      "Skew: -0.061 Prob(JB): 0.653\n",
      "Kurtosis: 1.996 Cond. No. 66.7\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# Full statistical summary of the fit\n",
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simulated data: effect of an extra constant column\n",
    "\n",
    "The data are generated as `y = 0.05 * x + error`, so the coefficient of `x` used to generate them is positive. The two fits below differ only in whether the all-ones column `z` is included."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate the model data\n",
    "np.random.seed(5320)\n",
    "n_obs = 20\n",
    "x = np.arange(n_obs) / 2\n",
    "error = np.round(np.random.randn(n_obs), 2)\n",
    "y = 0.05 * x + error\n",
    "# The newly added irrelevant variable z is identically 1\n",
    "z = np.ones(n_obs)\n",
    "sim_data = pd.DataFrame({\"x\": x, \"z\": z, \"y\": y})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " OLS Regression Results \n",
      "=======================================================================================\n",
      "Dep. Variable: y R-squared (uncentered): 0.204\n",
      "Model: OLS Adj. R-squared (uncentered): 0.162\n",
      "Method: Least Squares F-statistic: 4.878\n",
      "Date: Fri, 08 Dec 2023 Prob (F-statistic): 0.0397\n",
      "Time: 10:39:38 Log-Likelihood: -29.583\n",
      "No. Observations: 20 AIC: 61.17\n",
      "Df Residuals: 19 BIC: 62.16\n",
      "Df Model: 1 \n",
      "Covariance Type: nonrobust \n",
      "==============================================================================\n",
      " coef std err t P>|t| [0.025 0.975]\n",
      "------------------------------------------------------------------------------\n",
      "x 0.0969 0.044 2.209 0.040 0.005 0.189\n",
      "==============================================================================\n",
      "Omnibus: 0.871 Durbin-Watson: 2.037\n",
      "Prob(Omnibus): 0.647 Jarque-Bera (JB): 0.815\n",
      "Skew: 0.275 Prob(JB): 0.665\n",
      "Kurtosis: 2.179 Cond. No. 1.00\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] R² is computed without centering (uncentered) since the model does not contain a constant.\n",
      "[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# Without the extra variable, the sign of the x coefficient is estimated correctly (positive)\n",
    "model_x = sm.OLS(sim_data[['y']], sim_data[['x']])\n",
    "result_x = model_x.fit()\n",
    "print(result_x.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " OLS Regression Results \n",
      "==============================================================================\n",
      "Dep. Variable: y R-squared: 0.005\n",
      "Model: OLS Adj. R-squared: -0.050\n",
      "Method: Least Squares F-statistic: 0.09171\n",
      "Date: Fri, 08 Dec 2023 Prob (F-statistic): 0.765\n",
      "Time: 10:39:38 Log-Likelihood: -27.982\n",
      "No. Observations: 20 AIC: 59.96\n",
      "Df Residuals: 18 BIC: 61.96\n",
      "Df Model: 1 \n",
      "Covariance Type: nonrobust \n",
      "==============================================================================\n",
      " coef std err t P>|t| [0.025 0.975]\n",
      "------------------------------------------------------------------------------\n",
      "x -0.0243 0.080 -0.303 0.765 -0.193 0.144\n",
      "z 0.7873 0.445 1.768 0.094 -0.148 1.723\n",
      "==============================================================================\n",
      "Omnibus: 0.939 Durbin-Watson: 2.375\n",
      "Prob(Omnibus): 0.625 Jarque-Bera (JB): 0.886\n",
      "Skew: 0.338 Prob(JB): 0.642\n",
      "Kurtosis: 2.221 Cond. No. 11.0\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# With the extra variable, the sign of the x coefficient is estimated incorrectly (negative)\n",
    "model_xz = sm.OLS(sim_data[['y']], sim_data[['x', 'z']])\n",
    "result_xz = model_xz.fit()\n",
    "print(result_xz.summary())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}