In [1]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd


# Seed NumPy's global random generator so the random draws made later in the
# notebook (e.g. np.random.randint) are reproducible across runs.
np.random.seed(4873)

In [2]:
# Load the example data set; keep the response and the regressor as DataFrames.
data = pd.read_csv('./data/simple_example.csv')
Y = data[['y']]
# Take an explicit copy so that adding a column below modifies an independent
# frame instead of a selection of `data` (avoids pandas' SettingWithCopyWarning).
X = data[['x']].copy()
# Add a new random 0/1 variable; the coefficient of this variable should be 0.
# The number of draws follows the number of rows rather than a hard-coded 20,
# so the cell keeps working if the CSV has a different length.
X['z'] = np.random.randint(2, size=len(X))
# Add the constant (intercept) column.
X = sm.add_constant(X)

In [3]:
# Build the OLS model (response Y, design matrix X) and fit it.
# NOTE: the result keeps the name `re` because later cells read it.
model = sm.OLS(endog=Y, exog=X)
re = model.fit()

In [4]:
# Overall statistical summary of the fitted model.
print(re.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.959
Method:                 Least Squares   F-statistic:                     222.8
Date:                Fri, 08 Dec 2023   Prob (F-statistic):           6.38e-13
Time:                        10:39:38   Log-Likelihood:                -31.141
No. Observations:                  20   AIC:                             68.28
Df Residuals:                      17   BIC:                             71.27
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8983      0.953     -0.942      0.3

In [5]:
# Generate the simulated data for the model.
np.random.seed(5320)
# Regressor: 0, 0.5, 1.0, ..., 9.5
x = np.arange(20) / 2
# Standard-normal noise, rounded to two decimals.
error = np.random.randn(20).round(2)
y = error + 0.05 * x
# The newly added irrelevant variable z is identically equal to 1.
z = np.ones(20)
data = pd.DataFrame(dict(x=x, z=z, y=y))

In [6]:
# Without the redundant variable, the sign of the x coefficient is estimated
# correctly: it is positive.
# Only x is passed as a regressor, so this fit has no intercept column.
model = sm.OLS(endog=data[['y']], exog=data[['x']])
re = model.fit()
print(re.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.204
Model:                            OLS   Adj. R-squared (uncentered):              0.162
Method:                 Least Squares   F-statistic:                              4.878
Date:                Fri, 08 Dec 2023   Prob (F-statistic):                      0.0397
Time:                        10:39:38   Log-Likelihood:                         -29.583
No. Observations:                  20   AIC:                                      61.17
Df Residuals:                      19   BIC:                                      62.16
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [7]:
# With the redundant variable z added, the sign of the x coefficient is
# estimated incorrectly: it is negative.
model1 = sm.OLS(endog=data[['y']], exog=data[['x', 'z']])
re1 = model1.fit()
print(re1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                 -0.050
Method:                 Least Squares   F-statistic:                   0.09171
Date:                Fri, 08 Dec 2023   Prob (F-statistic):              0.765
Time:                        10:39:38   Log-Likelihood:                -27.982
No. Observations:                  20   AIC:                             59.96
Df Residuals:                      18   BIC:                             61.96
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x             -0.0243      0.080     -0.303      0.7