毕业论文实证

Author

吴诗涛

Published

2023-04-22

1 CFPS数据导入

library(tidyverse)
library(haven)
library(labelled)

load("fam-and-person.RData")

2 数据清洗

从现有的 CFPS 数据中提取本文所需的变量，并对缺失、离群等数据进行剔除和调整。

# 整理户主信息
host_part <- person %>% 
  remove_labels() %>%    # 去除CFPS数据中的标签信息
  transmute(
    pid,
    Age = age,
    Gender = gender,
    Health = ifelse(qp201 == 5, 0, 1),
    Insurance = ifelse(qp605_a_78 == 1, 0, 1),
    Marriage = ifelse(qea0 == 2, 1, 0),
    Education = pmax(cfps2018eduy, cfps2018eduy_im, na.rm = TRUE)
  ) %>% 
  mutate(across(everything(), ~ replace_na(.x, 0)))

# 整理家庭成员信息
person_part <- person %>% 
  remove_labels() %>% 
  transmute(
    fid18 = fid18,
    就业 = ifelse(between(age, 16, 60) & employ != 1, 1, 0),  # 失业为1
    教育 = ifelse(coalesce(cfps2018eduy, cfps2018eduy_im) < 9, 1, 0),
    健康状况 = ifelse(qp201 == 5, 1, 0),
    医疗保险 = ifelse(qp605_a_78 == 1, 1, 0),
    信息获取 = ifelse(qn202 == 0, 1, 0),
    个人学校教育 = ifelse(pd501b < 0, 0, pd501b),  # 过去12个月学校教育支出
    个人校外教育 = ifelse(pd503r < 0, 0, pd503r),  # 过去12个月课外辅导费（元）
    个人教育总支出 = pmax(pd5total, pd5ckp, pd5total_m, na.rm = TRUE),
    个人教育总支出 = ifelse(个人教育总支出 < 0, 0, 个人教育总支出),
    劳动力 = ifelse(between(age, 14, 65), 1, 0)
  ) %>% 
  group_by(fid18) %>% 
  reframe(
    就业 = ifelse(mean(就业, na.rm = TRUE) == 1, 1, 0),
    教育 = ifelse(mean(教育, na.rm = TRUE) == 1, 1, 0),
    健康状况 = ifelse(mean(健康状况, na.rm = TRUE) == 1, 1, 0),
    医疗保险 = ifelse(mean(医疗保险, na.rm = TRUE) == 1, 1, 0),
    信息获取 = ifelse(mean(信息获取, na.rm = TRUE) == 1, 1, 0),
    家庭学校教育 = sum(个人学校教育, na.rm = TRUE),
    家庭校外教育 = sum(个人校外教育, na.rm = TRUE),
    家庭总教育 = sum(个人教育总支出, na.rm = TRUE),
    Ratio = round(sum(劳动力 == 0) / sum(劳动力 == 1), 2),
    Ratio = ifelse(is.infinite(Ratio), 4, Ratio)
  ) %>% 
  mutate(across(everything(), ~ replace_na(.x, 0)))

# 整理家庭信息，合并户主及家庭成员信息
# 得到本文数据
data <- famecon %>% 
  remove_labels() %>% 
  filter(fa1 == 5 | fa101 == 5) %>%   # 农村家庭
  filter(familysize18 > 0) %>% 
  transmute(
    fid18 = fid18,
    Nonagr = case_when(
      fo101 == 1 ~ 1,
      fo101 == 5 ~ 0,  
      .default = NA
    ),
    收入 = ifelse(fincome1_per < quantile(fincome1_per, na.rm = TRUE, prob = .4), 1, 0),
    资产 = ifelse(houseasset_gross == 0, 1, 0),
    饮用水 = ifelse(fa3 %in% c(1, 2, 5, 6, 7), 1, 0),
    做饭燃料 = ifelse(fa4 %in% c(1, 2), 1, 0),
    居住面积 = ifelse(fq801 / familysize18 > 12, 0, 1),
    Size = familysize18,
    Allowance = ifelse(fn100 == 1, 1, 0),
    resp1pid, 
    Region = case_when(
      provcd18 %in% c(11:13, 21, 31:33, 35, 37, 44, 46) ~ "东部", 
      provcd18 %in% c(14, 22:23, 34, 36, 41:43) ~ "中部", 
      provcd18 %in% c(15, 45, 50:54, 61:65) ~ "西部"
    ),
    Region = factor(Region, levels = c("东部", "中部", "西部")),
    provcd18
  ) %>% 
  drop_na() %>% 
  left_join(host_part, by = c("resp1pid" = "pid")) %>% 
  left_join(person_part, by = "fid18") %>% 
  drop_na() %>% 
  mutate(
    就业 = ifelse(Nonagr == 1, 0, 就业),
    Score = round((1/9 * (收入 + 就业 + 资产 + 饮用水 + 做饭燃料 + 居住面积) + 1/12 * (教育 + 健康状况 + 医疗保险 + 信息获取)), 2),
    Educationfee = log(家庭总教育 + 1),
    Schoolfee = log(家庭学校教育 + 1),
    Outfee = log(家庭校外教育 + 1)
  ) %>% 
  filter(!(Score == 0 & Schoolfee == 0)) %>% 
  filter(!(Nonagr == 0 & Schoolfee > quantile(Schoolfee, probs = .95)))

data

# A tibble: 8,216 × 31
    fid18 Nonagr  收入  资产 饮用水 做饭燃料 居住面积  Size Allowance  resp1pid
    <dbl>  <dbl> <dbl> <dbl>  <dbl>    <dbl>    <dbl> <dbl>     <dbl>     <dbl>
 1 100724      1     1     0      1        0        1     4         0 130492103
 2 100733      1     0     0      0        0        1     1         0 130463104
 3 100782      1     0     0      0        0        1     1         0 130517101
 4 100810      0     1     0      0        0        1     4         0 100810551
 5 100879      1     1     0      0        0        0     3         0 130630103
 6 101129      1     1     0      0        0        1     5         1 130896107
 7 101130      1     0     0      0        0        1     1         0 130897103
 8 101274      1     1     0      0        1        1     3         0 140093103
 9 101303      1     0     0      0        0        0     1         0 140122105
10 101711      1     0     0      0        0        1     4         0 140676103
# ℹ 8,206 more rows
# ℹ 21 more variables: Region <fct>, provcd18 <dbl>, Age <dbl>, Gender <dbl>,
#   Health <dbl>, Insurance <dbl>, Marriage <dbl>, Education <dbl>, 就业 <dbl>,
#   教育 <dbl>, 健康状况 <dbl>, 医疗保险 <dbl>, 信息获取 <dbl>,
#   家庭学校教育 <dbl>, 家庭校外教育 <dbl>, 家庭总教育 <dbl>, Ratio <dbl>,
#   Score <dbl>, Educationfee <dbl>, Schoolfee <dbl>, Outfee <dbl>

3 描述性分析

calculate_mean_sd <- function(df, vars){
  df %>%
    summarize(across(all_of(vars),
              list(mean = mean, sd = sd),
              .names = "{col}_{fn}"))
}

data %>% 
  calculate_mean_sd(c("Educationfee", "Nonagr", "Gender",
                      "Age", "Health", "Marriage", "Education",
                      "Insurance", "Allowance", "Size", "Ratio")) %>% 
  pivot_longer(everything(),
               names_sep = "_",
               names_to = c("variable", ".value")) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2)))

# A tibble: 11 × 3
   variable      mean    sd
   <chr>        <dbl> <dbl>
 1 Educationfee  0.86  2.62
 2 Nonagr        0.5   0.5 
 3 Gender        0.55  0.5 
 4 Age          52.1  14.5 
 5 Health        0.79  0.41
 6 Marriage      0.83  0.38
 7 Education     6.27  4.38
 8 Insurance     0.91  0.29
 9 Allowance     0.55  0.5 
10 Size          3.84  2   
11 Ratio         0.69  1.27

4 不同K值变量描述性分析

4.1 k=0.3

k <- 0.3
data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0)) %>% 
  group_by(Repoverty) %>% 
  calculate_mean_sd(c("Educationfee", "Nonagr", "Gender",
                      "Age", "Health", "Marriage", "Education",
                      "Insurance", "Allowance", "Size", "Ratio")) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>% 
  pivot_longer(-1,
             names_sep = "_",
             names_to = c("variable", ".value")) %>% 
  mutate(mean_sd = str_c(mean, "\n（", sd, "）")) %>% 
  pivot_wider(id_cols = variable,
              names_from = Repoverty,
              values_from = mean_sd)

# A tibble: 11 × 3
   variable     `0`               `1`               
   <chr>        <chr>             <chr>             
 1 Educationfee "0.99\n（2.79）"  "0.71\n（2.38）"  
 2 Nonagr       "0.54\n（0.5）"   "0.46\n（0.5）"   
 3 Gender       "0.54\n（0.5）"   "0.55\n（0.5）"   
 4 Age          "49.39\n（14.1）" "55.44\n（14.39）"
 5 Health       "0.85\n（0.36）"  "0.72\n（0.45）"  
 6 Marriage     "0.86\n（0.35）"  "0.79\n（0.41）"  
 7 Education    "7.56\n（4.16）"  "4.67\n（4.11）"  
 8 Insurance    "0.93\n（0.26）"  "0.88\n（0.32）"  
 9 Allowance    "0.49\n（0.5）"   "0.62\n（0.49）"  
10 Size         "3.9\n（1.91）"   "3.77\n（2.1）"   
11 Ratio        "0.46\n（1.01）"  "0.98\n（1.48）"

4.2 k=0.4

k <- 0.4
data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0)) %>% 
  group_by(Repoverty) %>% 
  calculate_mean_sd(c("Educationfee", "Nonagr", "Gender",
                      "Age", "Health", "Marriage", "Education",
                      "Insurance", "Allowance", "Size", "Ratio")) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>% 
  pivot_longer(-1,
             names_sep = "_",
             names_to = c("variable", ".value")) %>% 
  mutate(mean_sd = str_c(mean, "\n（", sd, "）")) %>% 
  pivot_wider(id_cols = variable,
              names_from = Repoverty,
              values_from = mean_sd)

# A tibble: 11 × 3
   variable     `0`                `1`              
   <chr>        <chr>              <chr>            
 1 Educationfee "0.95\n（2.74）"   "0.48\n（1.97）" 
 2 Nonagr       "0.54\n（0.5）"    "0.36\n（0.48）" 
 3 Gender       "0.55\n（0.5）"    "0.55\n（0.5）"  
 4 Age          "50.74\n（14.26）" "57.61\n（14.4）"
 5 Health       "0.82\n（0.38）"   "0.66\n（0.47）" 
 6 Marriage     "0.85\n（0.36）"   "0.74\n（0.44）" 
 7 Education    "6.9\n（4.28）"    "3.69\n（3.83）" 
 8 Insurance    "0.92\n（0.27）"   "0.86\n（0.35）" 
 9 Allowance    "0.52\n（0.5）"    "0.64\n（0.48）" 
10 Size         "3.91\n（1.96）"   "3.55\n（2.14）" 
11 Ratio        "0.56\n（1.12）"   "1.25\n（1.64）"

4.3 k=0.5

k <- 0.5
data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0)) %>% 
  group_by(Repoverty) %>% 
  calculate_mean_sd(c("Educationfee", "Nonagr", "Gender",
                      "Age", "Health", "Marriage", "Education",
                      "Insurance", "Allowance", "Size", "Ratio")) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>% 
  pivot_longer(-1,
             names_sep = "_",
             names_to = c("variable", ".value")) %>% 
  mutate(mean_sd = str_c(mean, "\n（", sd, "）")) %>% 
  pivot_wider(id_cols = variable,
              names_from = Repoverty,
              values_from = mean_sd)

# A tibble: 11 × 3
   variable     `0`               `1`               
   <chr>        <chr>             <chr>             
 1 Educationfee "0.92\n（2.7）"   "0.03\n（0.46）"  
 2 Nonagr       "0.52\n（0.5）"   "0.25\n（0.44）"  
 3 Gender       "0.55\n（0.5）"   "0.51\n（0.5）"   
 4 Age          "51.4\n（14.36）" "61.55\n（13.73）"
 5 Health       "0.81\n（0.39）"  "0.48\n（0.5）"   
 6 Marriage     "0.84\n（0.37）"  "0.68\n（0.47）"  
 7 Education    "6.52\n（4.35）"  "2.77\n（3.25）"  
 8 Insurance    "0.92\n（0.28）"  "0.81\n（0.39）"  
 9 Allowance    "0.54\n（0.5）"   "0.66\n（0.47）"  
10 Size         "3.91\n（2）"     "2.88\n（1.81）"  
11 Ratio        "0.61\n（1.18）"  "1.75\n（1.82）"

最终对上面三个结果进行合并，得到论文中的表 3。

5 回归模型

5.1 主回归模型（k=0.4）

k <- 0.4
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0))

probit_edu <- glm(Repoverty ~ Educationfee +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)

summary(probit_edu)


Call:
glm(formula = Repoverty ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio + 
    Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6667  -0.6647  -0.4540  -0.2802   2.8737  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.202928   0.112538  -1.803   0.0714 .  
Educationfee -0.034142   0.007503  -4.551 5.35e-06 ***
Gender        0.146105   0.035661   4.097 4.19e-05 ***
Age           0.001852   0.001541   1.202   0.2295    
Health       -0.331897   0.039607  -8.380  < 2e-16 ***
Marriage     -0.107123   0.046022  -2.328   0.0199 *  
Education    -0.086578   0.004335 -19.973  < 2e-16 ***
Insurance    -0.266796   0.056794  -4.698 2.63e-06 ***
Allowance     0.220714   0.036228   6.092 1.11e-09 ***
Size         -0.003814   0.009312  -0.410   0.6821    
Ratio         0.122604   0.015308   8.009 1.16e-15 ***
Region中部   -0.033246   0.044138  -0.753   0.4513    
Region西部    0.092031   0.042763   2.152   0.0314 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 8143.8  on 8215  degrees of freedom
Residual deviance: 7048.6  on 8203  degrees of freedom
AIC: 7074.6

Number of Fisher Scoring iterations: 5

Tip

Probit 回归模型回归系数解释需计算边际效应。

library(margins)
summary(margins(probit_edu))

       factor     AME     SE        z      p   lower   upper
          Age  0.0004 0.0004   1.2018 0.2295 -0.0003  0.0012
    Allowance  0.0527 0.0086   6.1198 0.0000  0.0358  0.0695
    Education -0.0207 0.0010 -21.0489 0.0000 -0.0226 -0.0187
 Educationfee -0.0081 0.0018  -4.5606 0.0000 -0.0116 -0.0046
       Gender  0.0349 0.0085   4.1045 0.0000  0.0182  0.0515
       Health -0.0792 0.0094  -8.4657 0.0000 -0.0975 -0.0609
    Insurance -0.0637 0.0135  -4.7129 0.0000 -0.0901 -0.0372
     Marriage -0.0256 0.0110  -2.3298 0.0198 -0.0471 -0.0041
        Ratio  0.0293 0.0036   8.0976 0.0000  0.0222  0.0363
   Region西部  0.0224 0.0104   2.1484 0.0317  0.0020  0.0428
   Region中部 -0.0077 0.0102  -0.7546 0.4505 -0.0277  0.0123
         Size -0.0009 0.0022  -0.4096 0.6821 -0.0053  0.0034

可以得到家庭教育支出的边际效应为 -0.0081。

5.2 中介效应模型（k=0.4）

probit_nonagr <- glm(Nonagr ~ Educationfee + 
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)
summary(probit_nonagr)


Call:
glm(formula = Nonagr ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio + 
    Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.7584  -1.0810   0.3792   1.0381   2.4863  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.175649   0.097354  -1.804 0.071195 .  
Educationfee  0.022311   0.005674   3.932 8.41e-05 ***
Gender       -0.076264   0.030430  -2.506 0.012205 *  
Age          -0.006303   0.001288  -4.895 9.84e-07 ***
Health        0.040190   0.037410   1.074 0.282687    
Marriage     -0.032308   0.042593  -0.759 0.448128    
Education    -0.009192   0.003688  -2.492 0.012698 *  
Insurance    -0.094678   0.052977  -1.787 0.073913 .  
Allowance     0.102858   0.030947   3.324 0.000888 ***
Size          0.165113   0.008336  19.807  < 2e-16 ***
Ratio        -0.236237   0.015969 -14.793  < 2e-16 ***
Region中部    0.198323   0.037178   5.334 9.58e-08 ***
Region西部    0.195839   0.036656   5.343 9.16e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11390  on 8215  degrees of freedom
Residual deviance: 10025  on 8203  degrees of freedom
AIC: 10051

Number of Fisher Scoring iterations: 4

probit_renonagr <- glm(Repoverty ~ Educationfee + Nonagr +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)
summary(probit_renonagr)


Call:
glm(formula = Repoverty ~ Educationfee + Nonagr + Gender + Age + 
    Health + Marriage + Education + Insurance + Allowance + Size + 
    Ratio + Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6826  -0.6552  -0.4439  -0.2676   2.8130  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.070138   0.114095  -0.615 0.538733    
Educationfee -0.031552   0.007565  -4.170 3.04e-05 ***
Nonagr       -0.313968   0.037539  -8.364  < 2e-16 ***
Gender        0.137459   0.035821   3.837 0.000124 ***
Age           0.001070   0.001548   0.691 0.489526    
Health       -0.330631   0.039777  -8.312  < 2e-16 ***
Marriage     -0.110826   0.046260  -2.396 0.016587 *  
Education    -0.087310   0.004342 -20.108  < 2e-16 ***
Insurance    -0.280394   0.057110  -4.910 9.12e-07 ***
Allowance     0.229645   0.036410   6.307 2.84e-10 ***
Size          0.015948   0.009585   1.664 0.096142 .  
Ratio         0.103773   0.015514   6.689 2.25e-11 ***
Region中部   -0.017044   0.044389  -0.384 0.700997    
Region西部    0.112843   0.043060   2.621 0.008778 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 8143.8  on 8215  degrees of freedom
Residual deviance: 6978.5  on 8202  degrees of freedom
AIC: 7006.5

Number of Fisher Scoring iterations: 5

6 工具变量回归（内生性检验）

library(AER) # 加载AER包

6.1 k=0.3

k <- 0.3
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0))

lvprobit_data <- probit_data %>% 
  mutate(.by = provcd18,
         Ivedufee = log((sum(家庭总教育) - 家庭总教育) / (n() - 1) + 1))

# 拟合IV回归模型
ivmodel <- ivreg(Repoverty ~ Educationfee | Ivedufee + Nonagr +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
         data = lvprobit_data)

# 检验是否存在内生性
summary(ivmodel, diagnostics = TRUE)


Call:
ivreg(formula = Repoverty ~ Educationfee | Ivedufee + Nonagr + 
    Gender + Age + Health + Marriage + Education + Insurance + 
    Allowance + Size + Ratio + Region, data = lvprobit_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.5169 -0.5169  0.1125  0.4831  1.4524 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.51693    0.01106  46.740  < 2e-16 ***
Educationfee -0.08131    0.01089  -7.467 9.03e-14 ***

Diagnostic tests:
                  df1  df2 statistic  p-value    
Weak instruments   13 8199     27.79  < 2e-16 ***
Wu-Hausman          1 8210     51.18 9.14e-13 ***
Sargan             12   NA   1136.63  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5303 on 8211 degrees of freedom
Multiple R-Squared: -0.1374,    Adjusted R-squared: -0.1375 
Wald test: 55.76 on 1 and 8211 DF,  p-value: 9.033e-14

6.2 k=0.4

k <- 0.4
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0))

lvprobit_data <- probit_data %>% 
  mutate(.by = provcd18,
         Ivedufee = log((sum(家庭总教育) - 家庭总教育) / (n() - 1) + 1))

# 拟合IV回归模型
ivmodel <- ivreg(Repoverty ~ Educationfee | Ivedufee + Nonagr +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
         data = lvprobit_data)

# 检验是否存在内生性
summary(ivmodel, diagnostics = TRUE)


Call:
ivreg(formula = Repoverty ~ Educationfee | Ivedufee + Nonagr + 
    Gender + Age + Health + Marriage + Education + Insurance + 
    Allowance + Size + Ratio + Region, data = lvprobit_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.2887 -0.2887 -0.2887  0.6229  1.8444 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.288679   0.009784   29.50   <2e-16 ***
Educationfee -0.106781   0.009633  -11.09   <2e-16 ***

Diagnostic tests:
                  df1  df2 statistic p-value    
Weak instruments   13 8199     27.79  <2e-16 ***
Wu-Hausman          1 8210    147.31  <2e-16 ***
Sargan             12   NA    710.63  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4691 on 8211 degrees of freedom
Multiple R-Squared: -0.3928,    Adjusted R-squared: -0.393 
Wald test: 122.9 on 1 and 8211 DF,  p-value: < 2.2e-16

6.3 k=0.5

k <- 0.5
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0))

lvprobit_data <- probit_data %>% 
  mutate(.by = provcd18,
         Ivedufee = log((sum(家庭总教育) - 家庭总教育) / (n() - 1) + 1))

# 拟合IV回归模型
ivmodel <- ivreg(Repoverty ~ Educationfee | Ivedufee + Nonagr +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
         data = lvprobit_data)

# 检验是否存在内生性
summary(ivmodel, diagnostics = TRUE)


Call:
ivreg(formula = Repoverty ~ Educationfee | Ivedufee + Nonagr + 
    Gender + Age + Health + Marriage + Education + Insurance + 
    Allowance + Size + Ratio + Region, data = lvprobit_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1467 -0.1467 -0.1467 -0.1467  1.5488 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.146670   0.006915   21.21   <2e-16 ***
Educationfee -0.090914   0.006808  -13.35   <2e-16 ***

Diagnostic tests:
                  df1  df2 statistic p-value    
Weak instruments   13 8199     27.79  <2e-16 ***
Wu-Hausman          1 8210    276.65  <2e-16 ***
Sargan             12   NA    401.67  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3316 on 8211 degrees of freedom
Multiple R-Squared: -0.727, Adjusted R-squared: -0.7272 
Wald test: 178.3 on 1 and 8211 DF,  p-value: < 2.2e-16

7 稳健性检验

7.1 k=0.3

k <- 0.3
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0))

probit_edu <- glm(Repoverty ~ Educationfee +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)

summary(probit_edu)


Call:
glm(formula = Repoverty ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio + 
    Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3387  -0.9550  -0.6006   1.0478   2.4057  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.285156   0.098412   2.898 0.003761 ** 
Educationfee -0.021324   0.005765  -3.699 0.000217 ***
Gender        0.114242   0.030726   3.718 0.000201 ***
Age           0.004723   0.001309   3.609 0.000307 ***
Health       -0.292155   0.036924  -7.912 2.53e-15 ***
Marriage     -0.103635   0.042196  -2.456 0.014047 *  
Education    -0.081042   0.003739 -21.677  < 2e-16 ***
Insurance    -0.331956   0.053200  -6.240 4.38e-10 ***
Allowance     0.230230   0.030963   7.436 1.04e-13 ***
Size          0.012157   0.007930   1.533 0.125283    
Ratio         0.114665   0.014458   7.931 2.18e-15 ***
Region中部    0.146900   0.037431   3.925 8.69e-05 ***
Region西部    0.325649   0.036970   8.808  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11296.6  on 8215  degrees of freedom
Residual deviance:  9885.8  on 8203  degrees of freedom
AIC: 9911.8

Number of Fisher Scoring iterations: 4

probit_nonagr <- glm(Nonagr ~ Educationfee + 
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)
summary(probit_nonagr)


Call:
glm(formula = Nonagr ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio + 
    Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.7584  -1.0810   0.3792   1.0381   2.4863  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.175649   0.097354  -1.804 0.071195 .  
Educationfee  0.022311   0.005674   3.932 8.41e-05 ***
Gender       -0.076264   0.030430  -2.506 0.012205 *  
Age          -0.006303   0.001288  -4.895 9.84e-07 ***
Health        0.040190   0.037410   1.074 0.282687    
Marriage     -0.032308   0.042593  -0.759 0.448128    
Education    -0.009192   0.003688  -2.492 0.012698 *  
Insurance    -0.094678   0.052977  -1.787 0.073913 .  
Allowance     0.102858   0.030947   3.324 0.000888 ***
Size          0.165113   0.008336  19.807  < 2e-16 ***
Ratio        -0.236237   0.015969 -14.793  < 2e-16 ***
Region中部    0.198323   0.037178   5.334 9.58e-08 ***
Region西部    0.195839   0.036656   5.343 9.16e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11390  on 8215  degrees of freedom
Residual deviance: 10025  on 8203  degrees of freedom
AIC: 10051

Number of Fisher Scoring iterations: 4

probit_renonagr <- glm(Repoverty ~ Educationfee + Nonagr +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)
summary(probit_renonagr)


Call:
glm(formula = Repoverty ~ Educationfee + Nonagr + Gender + Age + 
    Health + Marriage + Education + Insurance + Allowance + Size + 
    Ratio + Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3620  -0.9507  -0.6008   1.0490   2.3628  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.351889   0.099556   3.535 0.000408 ***
Educationfee -0.019947   0.005778  -3.452 0.000556 ***
Nonagr       -0.147761   0.031785  -4.649 3.34e-06 ***
Gender        0.110441   0.030764   3.590 0.000331 ***
Age           0.004383   0.001312   3.341 0.000834 ***
Health       -0.290593   0.036966  -7.861 3.81e-15 ***
Marriage     -0.105533   0.042251  -2.498 0.012497 *  
Education    -0.081554   0.003742 -21.795  < 2e-16 ***
Insurance    -0.338226   0.053296  -6.346 2.21e-10 ***
Allowance     0.235217   0.031014   7.584 3.35e-14 ***
Size          0.020905   0.008158   2.562 0.010397 *  
Ratio         0.104714   0.014630   7.157 8.23e-13 ***
Region中部    0.156955   0.037529   4.182 2.89e-05 ***
Region西部    0.336589   0.037085   9.076  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11296.6  on 8215  degrees of freedom
Residual deviance:  9864.2  on 8202  degrees of freedom
AIC: 9892.2

Number of Fisher Scoring iterations: 4

7.2 k=0.5

k <- 0.5
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0))

probit_edu <- glm(Repoverty ~ Educationfee +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)

summary(probit_edu)


Call:
glm(formula = Repoverty ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio + 
    Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5272  -0.3748  -0.2034  -0.1154   3.8503  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -3.440e-01  1.620e-01  -2.123   0.0337 *  
Educationfee -1.467e-01  2.960e-02  -4.956 7.19e-07 ***
Gender        1.049e-01  5.130e-02   2.044   0.0409 *  
Age           4.501e-05  2.340e-03   0.019   0.9847    
Health       -6.217e-01  5.151e-02 -12.069  < 2e-16 ***
Marriage     -7.578e-03  6.236e-02  -0.122   0.9033    
Education    -8.923e-02  6.509e-03 -13.709  < 2e-16 ***
Insurance    -4.324e-01  7.205e-02  -6.001 1.96e-09 ***
Allowance     2.589e-01  5.279e-02   4.904 9.37e-07 ***
Size         -7.284e-02  1.509e-02  -4.827 1.39e-06 ***
Ratio         1.353e-01  2.027e-02   6.676 2.46e-11 ***
Region中部    4.283e-02  6.199e-02   0.691   0.4896    
Region西部    2.749e-02  6.241e-02   0.440   0.6596    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4094.4  on 8215  degrees of freedom
Residual deviance: 3184.4  on 8203  degrees of freedom
AIC: 3210.4

Number of Fisher Scoring iterations: 8

probit_nonagr <- glm(Nonagr ~ Educationfee + 
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)
summary(probit_nonagr)


Call:
glm(formula = Nonagr ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio + 
    Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.7584  -1.0810   0.3792   1.0381   2.4863  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.175649   0.097354  -1.804 0.071195 .  
Educationfee  0.022311   0.005674   3.932 8.41e-05 ***
Gender       -0.076264   0.030430  -2.506 0.012205 *  
Age          -0.006303   0.001288  -4.895 9.84e-07 ***
Health        0.040190   0.037410   1.074 0.282687    
Marriage     -0.032308   0.042593  -0.759 0.448128    
Education    -0.009192   0.003688  -2.492 0.012698 *  
Insurance    -0.094678   0.052977  -1.787 0.073913 .  
Allowance     0.102858   0.030947   3.324 0.000888 ***
Size          0.165113   0.008336  19.807  < 2e-16 ***
Ratio        -0.236237   0.015969 -14.793  < 2e-16 ***
Region中部    0.198323   0.037178   5.334 9.58e-08 ***
Region西部    0.195839   0.036656   5.343 9.16e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11390  on 8215  degrees of freedom
Residual deviance: 10025  on 8203  degrees of freedom
AIC: 10051

Number of Fisher Scoring iterations: 4

probit_renonagr <- glm(Repoverty ~ Educationfee + Nonagr +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio + Region,
          family = binomial(link = "probit"),
          data = probit_data)
summary(probit_renonagr)


Call:
glm(formula = Repoverty ~ Educationfee + Nonagr + Gender + Age + 
    Health + Marriage + Education + Insurance + Allowance + Size + 
    Ratio + Region, family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5362  -0.3668  -0.2003  -0.1073   3.7236  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.2193492  0.1646580  -1.332  0.18281    
Educationfee -0.1482288  0.0301332  -4.919 8.69e-07 ***
Nonagr       -0.3255706  0.0571351  -5.698 1.21e-08 ***
Gender        0.0940248  0.0515763   1.823  0.06830 .  
Age          -0.0009682  0.0023496  -0.412  0.68030    
Health       -0.6226487  0.0517813 -12.025  < 2e-16 ***
Marriage     -0.0112603  0.0628008  -0.179  0.85770    
Education    -0.0896268  0.0065224 -13.741  < 2e-16 ***
Insurance    -0.4458776  0.0725525  -6.146 7.97e-10 ***
Allowance     0.2669269  0.0530878   5.028 4.96e-07 ***
Size         -0.0479215  0.0154876  -3.094  0.00197 ** 
Ratio         0.1193959  0.0204703   5.833 5.46e-09 ***
Region中部    0.0595985  0.0623580   0.956  0.33920    
Region西部    0.0484851  0.0628922   0.771  0.44075    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4094.4  on 8215  degrees of freedom
Residual deviance: 3151.7  on 8202  degrees of freedom
AIC: 3179.7

Number of Fisher Scoring iterations: 8

8 异质性检验

8.1 东部

k <- 0.4
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0)) %>% 
  filter(Region == "东部")

probit_edu <- glm(Repoverty ~ Educationfee +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio,
          family = binomial(link = "probit"),
          data = probit_data)

summary(probit_edu)


Call:
glm(formula = Repoverty ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio, 
    family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8087  -0.6294  -0.4367  -0.2674   2.6936  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.480900   0.184167  -2.611 0.009022 ** 
Educationfee -0.028633   0.014493  -1.976 0.048199 *  
Gender        0.011474   0.058084   0.198 0.843402    
Age           0.006552   0.002616   2.505 0.012248 *  
Health       -0.350706   0.064469  -5.440 5.33e-08 ***
Marriage     -0.164711   0.074952  -2.198 0.027982 *  
Education    -0.074977   0.007439 -10.079  < 2e-16 ***
Insurance    -0.296652   0.081183  -3.654 0.000258 ***
Allowance     0.435848   0.056437   7.723 1.14e-14 ***
Size          0.015103   0.015335   0.985 0.324676    
Ratio         0.062306   0.023642   2.635 0.008404 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3013.4  on 3229  degrees of freedom
Residual deviance: 2602.5  on 3219  degrees of freedom
AIC: 2624.5

Number of Fisher Scoring iterations: 5

8.2 中部

k <- 0.4
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0)) %>% 
  filter(Region == "中部")

probit_edu <- glm(Repoverty ~ Educationfee +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio,
          family = binomial(link = "probit"),
          data = probit_data)

summary(probit_edu)


Call:
glm(formula = Repoverty ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio, 
    family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8301  -0.6163  -0.4466  -0.2815   2.6645  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.399540   0.222265  -1.798  0.07224 .  
Educationfee -0.049391   0.016634  -2.969  0.00298 ** 
Gender        0.184534   0.069261   2.664  0.00771 ** 
Age           0.007143   0.003159   2.261  0.02377 *  
Health       -0.292702   0.074905  -3.908 9.32e-05 ***
Marriage      0.031026   0.092941   0.334  0.73851    
Education    -0.074323   0.008350  -8.901  < 2e-16 ***
Insurance    -0.460124   0.117847  -3.904 9.45e-05 ***
Allowance     0.049653   0.070392   0.705  0.48057    
Size         -0.022708   0.018598  -1.221  0.22208    
Ratio         0.131129   0.029276   4.479 7.50e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2179.8  on 2268  degrees of freedom
Residual deviance: 1864.9  on 2258  degrees of freedom
AIC: 1886.9

Number of Fisher Scoring iterations: 5

8.3 西部

k <- 0.4
probit_data <- data %>% 
  mutate(Repoverty = ifelse(Score >= k, 1, 0)) %>% 
  filter(Region == "西部")

probit_edu <- glm(Repoverty ~ Educationfee +
         Gender + Age + Health + Marriage + Education +
         Insurance + Allowance + Size + Ratio,
          family = binomial(link = "probit"),
          data = probit_data)

summary(probit_edu)


Call:
glm(formula = Repoverty ~ Educationfee + Gender + Age + Health + 
    Marriage + Education + Insurance + Allowance + Size + Ratio, 
    family = binomial(link = "probit"), data = probit_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8651  -0.7154  -0.4620  -0.2381   2.8157  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.208166   0.185631   1.121  0.26212    
Educationfee -0.030610   0.010588  -2.891  0.00384 ** 
Gender        0.269974   0.061192   4.412 1.02e-05 ***
Age          -0.005673   0.002485  -2.283  0.02240 *  
Health       -0.360215   0.068854  -5.232 1.68e-07 ***
Marriage     -0.142701   0.076581  -1.863  0.06241 .  
Education    -0.107842   0.007204 -14.969  < 2e-16 ***
Insurance    -0.054482   0.109007  -0.500  0.61722    
Allowance     0.089322   0.062890   1.420  0.15553    
Size         -0.010170   0.015347  -0.663  0.50753    
Ratio         0.200941   0.029450   6.823 8.90e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2923.4  on 2716  degrees of freedom
Residual deviance: 2501.3  on 2706  degrees of freedom
AIC: 2523.3

Number of Fisher Scoring iterations: 5