Quellcode durchsuchen

update comment for ch05

Gen TANG vor 2 Jahren
Ursprung
Commit
2d95750b98

+ 13 - 0
ch05_econometrics/README.md

@@ -0,0 +1,13 @@
+
+|代码|说明|
+|---|---|
+|[categorical_variable.ipynb](categorical_variable.ipynb)| 定性特征的处理 |
+|[continuous_variable.ipynb](continuous_variable.ipynb)| 定量特征的处理 |
+|[multicollinearity.ipynb](multicollinearity.ipynb)| 多重共线性问题 |
+|[one\_way_anova.ipynb](one_way_anova.ipynb)| 利用one-way ANOVA检测定量特征与定性特征之间的多重共线性 |
+
+
+
+
+
+

+ 7 - 4
ch05_econometrics/continuous_variable.ipynb

@@ -199,7 +199,7 @@
     }
    ],
    "source": [
-    "# 划分5份\n",
+    "# 将每星期工作时间平均划分为5份\n",
     "category5 = range(0, 105, 20)\n",
     "train_data = trans_feature(train_set, category5)\n",
     "test_data = trans_feature(test_set, category5)\n",
@@ -229,6 +229,7 @@
     }
    ],
    "source": [
+    "# 展示模型结果\n",
     "re = evaluation(category5_res, base_res)\n",
     "re.savefig('continous_var_cut_5.png', dpi=200)"
    ]
@@ -275,7 +276,7 @@
     }
    ],
    "source": [
-    "# 划分10份\n",
+    "# 将每星期工作时间平均划分为10份\n",
     "category10 = range(0, 105, 10)\n",
     "train_data = trans_feature(train_set, category10)\n",
     "test_data = trans_feature(test_set, category10)\n",
@@ -305,6 +306,7 @@
     }
    ],
    "source": [
+    "# 展示模型结果\n",
     "re = evaluation(category10_res, base_res)\n",
     "re.savefig('continous_var_cut_10.png', dpi=200)"
    ]
@@ -317,7 +319,7 @@
    "source": [
     "def get_category(data):\n",
     "    '''\n",
-    "    基于卡方检验,得到每星期工作时间的最优分段\n",
+    "    基于卡方检验,得到每星期工作时间的最优分段\n",
     "    '''\n",
     "    interval = [data['hours_per_week'].min(), data['hours_per_week'].max()]\n",
     "    _category = do_divide(data, interval)\n",
@@ -330,7 +332,7 @@
     "\n",
     "def do_divide(data, interval):\n",
     "    '''\n",
-    "    使用贪心算法,得到最优的分段\n",
+    "    使用贪心算法,得到最优的分段\n",
     "    '''\n",
     "    category = []\n",
     "    p_value, chi2, index = divide_data(data, interval[0], interval[1])\n",
@@ -427,6 +429,7 @@
     }
    ],
    "source": [
+    "# 展示模型结果\n",
     "re = evaluation(category_chi2_res, base_res)\n",
     "re.savefig('continous_var_cut_chi2.png', dpi=200)"
    ]

+ 6 - 6
ch05_econometrics/multicollinearity.ipynb

@@ -14,7 +14,7 @@
     "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "\n",
-    "\n",
+    "# 设置随机数种子,使运行结果可复现\n",
     "np.random.seed(2046)"
    ]
   },
@@ -209,14 +209,14 @@
     }
    ],
    "source": [
-    "# 生成模型数据,其中x1,x2为不相关的变量;x1,x3强相关\n",
+    "# 生成模型数据,其中x1,x2为不相关的特征;x1,x3强相关\n",
     "n = 2\n",
     "data = []\n",
     "for i in range(0, 3):\n",
     "    for j in range(0, 3):\n",
     "        data.append({'x1': i, 'x2': j})\n",
     "data = pd.DataFrame(data * n)\n",
-    "# 生成强相关自变量\n",
+    "# 生成与x1强相关的特征x3\n",
     "data['x3'] = data['x1'] + np.random.random(9 * n)\n",
     "# 生成被预测值\n",
     "error = 0.1 * np.random.random(9 * n)\n",
@@ -699,7 +699,7 @@
     }
    ],
    "source": [
-    "# 将变量的中心重置为0\n",
+    "# 将特征的中心重置为0\n",
     "Y = data['y']\n",
     "X = data[['x3']]\n",
     "X2 = X ** 2\n",
@@ -770,7 +770,7 @@
     }
    ],
    "source": [
-    "# 使用未归一化的变量建模\n",
+    "# 使用未归一化的特征建模\n",
     "X = pd.concat([X, X2], axis=1, ignore_index=True)\n",
     "X.columns = ['x3', 'x3_squared']\n",
     "X = sm.add_constant(X)\n",
@@ -825,7 +825,7 @@
     }
    ],
    "source": [
-    "# 使用重置之后的变量建模\n",
+    "# 使用重置之后的特征建模\n",
     "X = pd.concat([center_x, center_x2], axis=1, ignore_index=True)\n",
     "X.columns = ['x3_center', 'x3_center_sqaured']\n",
     "X = sm.add_constant(X)\n",

+ 3 - 2
ch05_econometrics/one_way_anova.ipynb

@@ -12,7 +12,7 @@
     "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
     "\n",
-    "\n",
+    "# 设置随机数种子,使运行结果可复现\n",
     "np.random.seed(2046)"
    ]
   },
@@ -181,6 +181,7 @@
     }
    ],
    "source": [
+    "# 生成样例数据\n",
     "d1 = np.random.normal(5, 5, 10)\n",
     "d2 = np.random.normal(5, 5, 10)\n",
     "groups = ['d1'] * 10 + ['d2'] * 10\n",
@@ -239,7 +240,7 @@
     }
    ],
    "source": [
-    "# 计算定量变量A与定性变量B之间的eta squared\n",
+    "# 计算定量特征A与定性特征B之间的eta squared\n",
     "re = sm.OLS.from_formula('A ~ B', data=data).fit()\n",
     "aov_table = sm.stats.anova_lm(re, typ=2)\n",
     "# 打印ANOVA分析结果\n",