分析流程

讀進資料

並將 ID 視為類別變項(factor)

dta <- read.csv('Chemistry.csv',header=T)
head(dta)
##   area schID stuID chem gender sciscore
## 1    1     1     1    4      F    6.625
## 2    1     1     2   10      F    7.625
## 3    1     1     3   10      F    7.250
## 4    1     1     4   10      F    7.500
## 5    1     1     5    8      F    6.444
## 6    1     1     6   10      F    7.750
dta$schID <- as.factor(dta$schID)
dta$area <- as.factor(dta$area)

先看描述統計量

sapply(dta[, c('chem', 'sciscore')], summary)
##              chem sciscore
## Min.     0.000000 0.000000
## 1st Qu.  4.000000 5.750000
## Median   6.000000 6.375000
## Mean     5.812585 6.285684
## 3rd Qu.  8.000000 6.900000
## Max.    10.000000 8.000000
sapply(dta[, c('chem', 'sciscore')], sd)
##      chem  sciscore 
## 3.3191673 0.8734512

載入 lme4 套件,用來分析多層次資料

lmerTest 加入 p-value, misty 用來幫忙置中

library(pacman)
## Warning: 套件 'pacman' 是用 R 版本 4.2.3 來建造的
p_load(lme4,lmerTest,misty)

以下是第二部份,正式分析

我們把 chem 以分組平均置中

先讓大家看模型怎麼寫

  • 模型中,是 依變項~獨變項 的形式,A:B 是指交互作用項,與迴歸寫法類似
  • (1|schID), (1|area) 表示有學校與區域的隨機截距
  • 知道要把交互作用,隨機截距加進去,其實是因為事先探索過資料了
dta$chem_wc <- center(dta$chem, type = "CWC", cluster = dta$schID)
m0 <- lmer(sciscore ~ chem_wc + gender + chem_wc:gender + ( 1 | schID ) + ( 1 | area ), data = dta ) 
summary(m0)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sciscore ~ chem_wc + gender + chem_wc:gender + (1 | schID) +  
##     (1 | area)
##    Data: dta
## 
## REML criterion at convergence: 57493
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.9336 -0.5632  0.0668  0.6461  4.0701 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  schID    (Intercept) 0.23680  0.4866  
##  area     (Intercept) 0.02649  0.1628  
##  Residual             0.31811  0.5640  
## Number of obs: 31022, groups:  schID, 2410; area, 131
## 
## Fixed effects:
##                   Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)      6.312e+00  2.008e-02  1.061e+02 314.359  < 2e-16 ***
## chem_wc          1.544e-01  1.801e-03  2.878e+04  85.738  < 2e-16 ***
## genderM         -3.410e-01  7.693e-03  3.058e+04 -44.330  < 2e-16 ***
## chem_wc:genderM  9.746e-03  2.384e-03  2.895e+04   4.089 4.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) chm_wc gendrM
## chem_wc     -0.007              
## genderM     -0.204  0.022       
## chm_wc:gndM  0.004 -0.772 -0.007
  • 隨機截距,school 的效果比較大、area 效果比較小
  • 固定效果中,化學、性別及交互作用都顯著,交互作用效果相對較小

試著去除區域的隨機效果,並看看去除隨機效果是否顯著。

m1 <- update(m0, . ~ . - ( 1 | area ) )
anova(m0, m1)
## refitting model(s) with ML (instead of REML)
## Data: dta
## Models:
## m1: sciscore ~ chem_wc + gender + (1 | schID) + chem_wc:gender
## m0: sciscore ~ chem_wc + gender + chem_wc:gender + (1 | schID) + (1 | area)
##    npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## m1    6 57542 57592 -28765    57530                         
## m0    7 57471 57529 -28729    57457 73.226  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • 模型比較顯著時,要選複雜的模型
  • 模型比較不顯著時,要選簡單的模型
  • 包含與不包含區域的隨機截距,差異顯著,顯示區域隨機截距有效果,不應去除

試著去除交互作用項效果,並看看去除交互作用是否顯著。

drop1(m0, test = 'Chisq')
## Single term deletions using Satterthwaite's method:
## 
## Model:
## sciscore ~ chem_wc + gender + chem_wc:gender + (1 | schID) + (1 | area)
##                Sum Sq Mean Sq NumDF DenDF F value   Pr(>F)    
## chem_wc:gender  5.318   5.318     1 28947  16.718 4.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    • 包含與不包含交互作用效果,差異顯著,顯示交互作用有效果,不應去除

抽取變異成分,計算學校與區域可以解釋部分(ICCs)

print(vc <- VarCorr(m0), comp = 'Variance' )
##  Groups   Name        Variance
##  schID    (Intercept) 0.236796
##  area     (Intercept) 0.026494
##  Residual             0.318112
vc <- as.data.frame(vc)
vc[vc$grp=='schID', 'vcov']/ sum(vc$vcov)
## [1] 0.4072844
vc[vc$grp=='area', 'vcov']/ sum(vc$vcov)
## [1] 0.0455687
1 - (vc[vc$grp=='Residual', 'vcov']/ sum(vc$vcov))
## [1] 0.4528531

計算科學成績的 ICC(1), ICC(2), Rwg

##      ICC1      ICC2 
## 0.2678874 0.8248708
##     grpid                rwg             gsize       
##  Length:2410        Min.   :0.0000   Min.   :  1.00  
##  Class :character   1st Qu.:0.6224   1st Qu.:  4.00  
##  Mode  :character   Median :0.7488   Median :  8.00  
##                     Mean   :0.7087   Mean   : 12.87  
##                     3rd Qu.:0.8458   3rd Qu.: 17.00  
##                     Max.   :1.0000   Max.   :188.00  
##                     NA's   :162
  • 後面會另外討論計算 ICC(1), ICC(2)與 RWG 的意涵

底下集中講隨機截距斜率與包含隨機斜率的模型,先是學校的隨機截距

  • (1|schID), (1|area) 表示有學校與區域的隨機截距
m1 <- lmer(sciscore ~ 1 + chem_wc + ( 1 | schID ) , data = dta ) 
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sciscore ~ 1 + chem_wc + (1 | schID)
##    Data: dta
## 
## REML criterion at convergence: 59461.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.7567 -0.5591  0.0710  0.6434  4.4222 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  schID    (Intercept) 0.2731   0.5226  
##  Residual             0.3386   0.5819  
## Number of obs: 31022, groups:  schID, 2410
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 6.150e+00  1.171e-02 2.169e+03   525.0   <2e-16 ***
## chem_wc     1.615e-01  1.181e-03 2.850e+04   136.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## chem_wc 0.000

學校層級的隨機截距、化學隨機斜率

  • (1+chem_wc|schID)表示有學校層級的化學成績隨機斜率
m2 <- lmer(sciscore ~ 1 + chem_wc + ( 1 + chem_wc| schID ) , data = dta,
       control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5))) 
summary(m2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sciscore ~ 1 + chem_wc + (1 + chem_wc | schID)
##    Data: dta
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 59378.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.8217 -0.5567  0.0736  0.6440  4.3327 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  schID    (Intercept) 0.2742609 0.52370       
##           chem_wc     0.0006018 0.02453  -0.36
##  Residual             0.3337245 0.57769       
## Number of obs: 31022, groups:  schID, 2410
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 6.149e+00  1.172e-02 2.170e+03   524.6   <2e-16 ***
## chem_wc     1.636e-01  1.371e-03 1.208e+03   119.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## chem_wc -0.132

以下是第一部份,探索資料

dta_m <- aggregate(cbind(chem, sciscore, gender=="F") ~ schID, 
                   data = dta, mean)
names(dta_m) <- c('schID', 'chem_mean', 'sciscore_mean', 'girlprop')
head(dta_m)
##   schID chem_mean sciscore_mean  girlprop
## 1     1  8.307692      7.224462 1.0000000
## 2     2  8.714286      7.023893 0.0000000
## 3     3  4.666667      5.310000 0.6666667
## 4     4  8.285714      6.232143 0.1428571
## 5     5  5.500000      6.174313 0.4375000
## 6     6  4.714286      6.705000 0.7857143
dta_sd <- aggregate(cbind(chem, sciscore) ~ schID, data = dta, sd)
names(dta_sd) <- c('schID','chem_SD', 'sciscore_SD')
head(dta_sd)
##   schID  chem_SD sciscore_SD
## 1     1 2.287087   0.5394558
## 2     2 2.258435   0.5376998
## 3     3 4.163332   1.0800116
## 4     4 3.147183   0.5702182
## 5     5 2.476557   0.8257799
## 6     6 3.729891   0.7967226

載進 lattice,準備畫圖

  • 以總成績與化學原始分數,繪製直方圖
  • 以學校平均分數,繪製直方圖
  • 以學校標準差,繪製直方圖
  • 這邊都沒有呈現圖,只是把圖做出來
p_load(lattice)

p1 <- histogram(~ sciscore, data = dta, type = 'density', xlab = '總成績',
                ylab = '機率密度')
p2 <- histogram(~ chem, data = dta, type = 'density', xlab = '化學分數', 
                ylab = '機率密度')


p3 <- histogram(~ sciscore_mean, data = dta_m, type = 'density', 
          xlab = '各校總成績平均', ylab = '機率密度')
p4 <- histogram(~ chem_mean, data = dta_m, type = 'density', 
          xlab = '各校化學平均分數', ylab = '機率密度')


p5 <- histogram(~ sciscore_SD, data = dta_sd, type = 'density',
          xlab = '各校總成績標準差', ylab = '機率密度')
          
p6 <- histogram(~ chem_SD, data = dta_sd, type = 'density',
          xlab = '各校化學分數標準差', ylab = '機率密度')

載入gridExtra,把六張放一起

p_load(gridExtra)
grid.arrange(p6, p5, p4, p3, p2, p1, as.table = T)

看看化學與總成績相關

  • 以學生層次分數計算是 .662
  • 以學校層次分數計算生態相關(ecological correlation),為 .698
cor(dta[, c('chem', 'sciscore')])
##               chem  sciscore
## chem     1.0000000 0.6622476
## sciscore 0.6622476 1.0000000
cor(dta_m[, -1])
##                chem_mean sciscore_mean   girlprop
## chem_mean     1.00000000     0.6980651 0.06625497
## sciscore_mean 0.69806507     1.0000000 0.24520916
## girlprop      0.06625497     0.2452092 1.00000000

不只看相關,畫畫看兩者間關聯

plot(dta[, 4], dta[, 6], type = 'n', xlab ='化學分數', ylab = '總成績', 
     asp = 1)
grid()

#學生
points(dta[, 4], dta[, 6], pch = '.', cex = 2)
abline(lm(dta[,6] ~ dta[, 4]))

#學校
points(dta_m[, 2], dta_m[, 3], cex = 0.5, col = 'grey') 
abline(lm(dta_m[,3] ~ dta_m[,2]), col = 'grey')

#對角線
abline(0, 1, lty = 3, col = 'blue')

看看各校與各區域以化學預測總分時的截距與斜率

  • 請注意原始文章的預測變項與解釋變項跟此所作是相反的。
  • 載入 ggplot2 套件,準備畫圖。
p_load(ggplot2)

# 記錄下原始配色
old <- theme_set(theme_bw())

各校以化學預測總分時的截距與斜率

ggplot(data = dta, aes(x = chem, y = sciscore, group = schID))+
 stat_smooth(method = 'lm', se = F, color = 'lightgray') +
 geom_point(size = 1) +
 stat_smooth(aes(group = 1), method= 'lm', se = F, color = 'black') +
 labs(x = '化學分數', y = '總成績', title = '學校')
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

各區域以化學預測總分時的截距與斜率

ggplot(data = dta, aes(x = chem, y = sciscore, group = area))+
 stat_smooth(method = 'lm', se = F, color = 'lightgray') +
 geom_point(size = 1) +
 stat_smooth(aes(group = 1), method = 'lm', se = F, color = 'black') +
 labs(x = '化學分數', y = '總成績' , title = '區域') 
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

剛剛資料太多不清楚,所以隨機選取25個學校與區域重畫

重看一次各校以化學預測總分時的截距與斜率

  • 將變項以總分均置中,亦即,減去總平均。
dta$chem_centered <- scale(dta$chem, scale = F)
set.seed(1225)
ns25 <- sample(levels(dta$schID), 25)
set.seed(1225)
nr25 <- sample(levels(dta$area), 25)


ggplot(data = dta[dta$schID %in% ns25, ], aes(x = chem_centered, y = sciscore, color = gender))+
  geom_point(size = 1) +
  stat_smooth(method = 'lm', se = F) +
  facet_wrap( ~ schID)
## `geom_smooth()` using formula = 'y ~ x'

重看一次各區域以化學預測總分時的截距與斜率

ggplot(data = dta[dta$area %in% nr25, ], aes(x = chem_centered, y = sciscore, color = gender))+
  geom_point(size = 1) +
  stat_smooth(method = 'lm', se = F) +
  facet_wrap( ~ area )
## `geom_smooth()` using formula = 'y ~ x'

底下是第三部份,診斷(檢查假設)

學校與區域層次的隨機效果QQ圖,檢查隨機效果是否呈常態

qq_r21 <- qqmath(~ ranef(m0, condVar = T)$schID, type = c('p', 'g', 'r'), pch = '.',xlab = '常態分位數', ylab = '學校隨機效果分位數')


qq_r22 <- qqmath(~ ranef(m0, condVar = T)$area, type = c('p', 'g', 'r'), pch = '.',xlab = '常態分位數', ylab = '區域隨機效果分位數')

殘差的QQ圖,檢驗殘差是否呈常態

qq_r0 <- qqmath(~ resid(m0, scale = T), type = c('p', 'g', 'r'), pch = '.',xlab = '標準化常態分位數', ylab = '標準化殘差')

預測值對殘差圖,檢查殘差與獨變項無關的假設

m0_f <- fortify.merMod(m0)
r_m0 <- ggplot(data = m0_f, aes(x = .fitted, y = .scresid)) + 
          geom_point(pch = '.') +
          stat_smooth(method = 'loess', se = F) +
          labs( x = '預測值', y = '標準化殘差')

把圖放在一起呈現

grid.arrange(qq_r21, qq_r22, qq_r0, r_m0, nrow = 2, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'