讀進資料
並將 ID 視為類別變項(factor)
dta <- read.csv('Chemistry.csv',header=T)
head(dta)
## area schID stuID chem gender sciscore
## 1 1 1 1 4 F 6.625
## 2 1 1 2 10 F 7.625
## 3 1 1 3 10 F 7.250
## 4 1 1 4 10 F 7.500
## 5 1 1 5 8 F 6.444
## 6 1 1 6 10 F 7.750
dta$schID <- as.factor(dta$schID)
dta$area <- as.factor(dta$area)
先看描述統計量
sapply(dta[, c('chem', 'sciscore')], summary)
## chem sciscore
## Min. 0.000000 0.000000
## 1st Qu. 4.000000 5.750000
## Median 6.000000 6.375000
## Mean 5.812585 6.285684
## 3rd Qu. 8.000000 6.900000
## Max. 10.000000 8.000000
sapply(dta[, c('chem', 'sciscore')], sd)
## chem sciscore
## 3.3191673 0.8734512
載入 lme4 套件,用來分析多層次資料
lmerTest 加入 p-value, misty 用來幫忙置中
library(pacman)
## Warning: 套件 'pacman' 是用 R 版本 4.2.3 來建造的
p_load(lme4,lmerTest,misty)
以下是第二部份,正式分析
我們把 chem 以分組平均置中
先讓大家看模型怎麼寫
- 模型中,是 依變項~獨變項 的形式,A:B
是指交互作用項,與迴歸寫法類似
- (1|schID), (1|area) 表示有學校與區域的隨機截距
- 知道要把交互作用,隨機截距加進去,其實是因為事先探索過資料了
dta$chem_wc <- center(dta$chem, type = "CWC", cluster = dta$schID)
m0 <- lmer(sciscore ~ chem_wc + gender + chem_wc:gender + ( 1 | schID ) + ( 1 | area ), data = dta )
summary(m0)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sciscore ~ chem_wc + gender + chem_wc:gender + (1 | schID) +
## (1 | area)
## Data: dta
##
## REML criterion at convergence: 57493
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -8.9336 -0.5632 0.0668 0.6461 4.0701
##
## Random effects:
## Groups Name Variance Std.Dev.
## schID (Intercept) 0.23680 0.4866
## area (Intercept) 0.02649 0.1628
## Residual 0.31811 0.5640
## Number of obs: 31022, groups: schID, 2410; area, 131
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.312e+00 2.008e-02 1.061e+02 314.359 < 2e-16 ***
## chem_wc 1.544e-01 1.801e-03 2.878e+04 85.738 < 2e-16 ***
## genderM -3.410e-01 7.693e-03 3.058e+04 -44.330 < 2e-16 ***
## chem_wc:genderM 9.746e-03 2.384e-03 2.895e+04 4.089 4.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) chm_wc gendrM
## chem_wc -0.007
## genderM -0.204 0.022
## chm_wc:gndM 0.004 -0.772 -0.007
- 隨機截距,school 的效果比較大、area 效果比較小
- 固定效果中,化學、性別及交互作用都顯著,交互作用效果相對較小
試著去除區域的隨機效果,並看看去除隨機效果是否顯著。
m1 <- update(m0, . ~ . - ( 1 | area ) )
anova(m0, m1)
## refitting model(s) with ML (instead of REML)
## Data: dta
## Models:
## m1: sciscore ~ chem_wc + gender + (1 | schID) + chem_wc:gender
## m0: sciscore ~ chem_wc + gender + chem_wc:gender + (1 | schID) + (1 | area)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m1 6 57542 57592 -28765 57530
## m0 7 57471 57529 -28729 57457 73.226 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- 模型比較顯著時,要選複雜的模型
- 模型比較不顯著時,要選簡單的模型
- 包含與不包含區域的隨機截距,差異顯著,顯示區域隨機截距有效果,不應去除
試著去除交互作用項效果,並看看去除交互作用是否顯著。
drop1(m0, test = 'Chisq')
## Single term deletions using Satterthwaite's method:
##
## Model:
## sciscore ~ chem_wc + gender + chem_wc:gender + (1 | schID) + (1 | area)
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## chem_wc:gender 5.318 5.318 1 28947 16.718 4.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- 包含與不包含交互作用效果,差異顯著,顯示交互作用有效果,不應去除
抽取變異成分,計算學校與區域可以解釋部分(ICCs)
print(vc <- VarCorr(m0), comp = 'Variance' )
## Groups Name Variance
## schID (Intercept) 0.236796
## area (Intercept) 0.026494
## Residual 0.318112
vc <- as.data.frame(vc)
vc[vc$grp=='schID', 'vcov']/ sum(vc$vcov)
## [1] 0.4072844
vc[vc$grp=='area', 'vcov']/ sum(vc$vcov)
## [1] 0.0455687
1 - (vc[vc$grp=='Residual', 'vcov']/ sum(vc$vcov))
## [1] 0.4528531
計算科學成績的 ICC(1), ICC(2), Rwg
## ICC1 ICC2
## 0.2678874 0.8248708
## grpid rwg gsize
## Length:2410 Min. :0.0000 Min. : 1.00
## Class :character 1st Qu.:0.6224 1st Qu.: 4.00
## Mode :character Median :0.7488 Median : 8.00
## Mean :0.7087 Mean : 12.87
## 3rd Qu.:0.8458 3rd Qu.: 17.00
## Max. :1.0000 Max. :188.00
## NA's :162
- 後面會另外討論計算 ICC(1), ICC(2)與 RWG 的意涵
底下集中講隨機截距斜率與包含隨機斜率的模型,先是學校的隨機截距
- (1|schID), (1|area) 表示有學校與區域的隨機截距
m1 <- lmer(sciscore ~ 1 + chem_wc + ( 1 | schID ) , data = dta )
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sciscore ~ 1 + chem_wc + (1 | schID)
## Data: dta
##
## REML criterion at convergence: 59461.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -8.7567 -0.5591 0.0710 0.6434 4.4222
##
## Random effects:
## Groups Name Variance Std.Dev.
## schID (Intercept) 0.2731 0.5226
## Residual 0.3386 0.5819
## Number of obs: 31022, groups: schID, 2410
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.150e+00 1.171e-02 2.169e+03 525.0 <2e-16 ***
## chem_wc 1.615e-01 1.181e-03 2.850e+04 136.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## chem_wc 0.000
學校層級的隨機截距、化學隨機斜率
- (1+chem_wc|schID)表示有學校層級的化學成績隨機斜率
m2 <- lmer(sciscore ~ 1 + chem_wc + ( 1 + chem_wc| schID ) , data = dta,
control=lmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
summary(m2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sciscore ~ 1 + chem_wc + (1 + chem_wc | schID)
## Data: dta
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## REML criterion at convergence: 59378.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -8.8217 -0.5567 0.0736 0.6440 4.3327
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## schID (Intercept) 0.2742609 0.52370
## chem_wc 0.0006018 0.02453 -0.36
## Residual 0.3337245 0.57769
## Number of obs: 31022, groups: schID, 2410
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 6.149e+00 1.172e-02 2.170e+03 524.6 <2e-16 ***
## chem_wc 1.636e-01 1.371e-03 1.208e+03 119.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## chem_wc -0.132
以下是第一部份,探索資料
- 計算各校化學與總成績平均分數,記錄成資料檔,留待後用
- 計算學校標準差,存檔留後用
dta_m <- aggregate(cbind(chem, sciscore, gender=="F") ~ schID,
data = dta, mean)
names(dta_m) <- c('schID', 'chem_mean', 'sciscore_mean', 'girlprop')
head(dta_m)
## schID chem_mean sciscore_mean girlprop
## 1 1 8.307692 7.224462 1.0000000
## 2 2 8.714286 7.023893 0.0000000
## 3 3 4.666667 5.310000 0.6666667
## 4 4 8.285714 6.232143 0.1428571
## 5 5 5.500000 6.174313 0.4375000
## 6 6 4.714286 6.705000 0.7857143
dta_sd <- aggregate(cbind(chem, sciscore) ~ schID, data = dta, sd)
names(dta_sd) <- c('schID','chem_SD', 'sciscore_SD')
head(dta_sd)
## schID chem_SD sciscore_SD
## 1 1 2.287087 0.5394558
## 2 2 2.258435 0.5376998
## 3 3 4.163332 1.0800116
## 4 4 3.147183 0.5702182
## 5 5 2.476557 0.8257799
## 6 6 3.729891 0.7967226
載進 lattice,準備畫圖
- 以總成績與化學原始分數,繪製直方圖
- 以學校平均分數,繪製直方圖
- 以學校標準差,繪製直方圖
- 這邊都沒有呈現圖,只是把圖做出來
p_load(lattice)
p1 <- histogram(~ sciscore, data = dta, type = 'density', xlab = '總成績',
ylab = '機率密度')
p2 <- histogram(~ chem, data = dta, type = 'density', xlab = '化學分數',
ylab = '機率密度')
p3 <- histogram(~ sciscore_mean, data = dta_m, type = 'density',
xlab = '各校總成績平均', ylab = '機率密度')
p4 <- histogram(~ chem_mean, data = dta_m, type = 'density',
xlab = '各校化學平均分數', ylab = '機率密度')
p5 <- histogram(~ sciscore_SD, data = dta_sd, type = 'density',
xlab = '各校總成績標準差', ylab = '機率密度')
p6 <- histogram(~ chem_SD, data = dta_sd, type = 'density',
xlab = '各校化學分數標準差', ylab = '機率密度')
看看化學與總成績相關
- 以學生層次分數計算是 .662
- 以學校層次分數計算生態相關(ecological correlation),為 .698
cor(dta[, c('chem', 'sciscore')])
## chem sciscore
## chem 1.0000000 0.6622476
## sciscore 0.6622476 1.0000000
cor(dta_m[, -1])
## chem_mean sciscore_mean girlprop
## chem_mean 1.00000000 0.6980651 0.06625497
## sciscore_mean 0.69806507 1.0000000 0.24520916
## girlprop 0.06625497 0.2452092 1.00000000
不只看相關,畫畫看兩者間關聯
plot(dta[, 4], dta[, 6], type = 'n', xlab ='化學分數', ylab = '總成績',
asp = 1)
grid()
#學生
points(dta[, 4], dta[, 6], pch = '.', cex = 2)
abline(lm(dta[,6] ~ dta[, 4]))
#學校
points(dta_m[, 2], dta_m[, 3], cex = 0.5, col = 'grey')
abline(lm(dta_m[,3] ~ dta_m[,2]), col = 'grey')
#對角線
abline(0, 1, lty = 3, col = 'blue')
看看各校與各區域以化學預測總分時的截距與斜率
- 請注意原始文章的預測變項與解釋變項跟此所作是相反的。
- 載入 ggplot2 套件,準備畫圖。
p_load(ggplot2)
# 記錄下原始配色
old <- theme_set(theme_bw())
各校以化學預測總分時的截距與斜率
ggplot(data = dta, aes(x = chem, y = sciscore, group = schID))+
stat_smooth(method = 'lm', se = F, color = 'lightgray') +
geom_point(size = 1) +
stat_smooth(aes(group = 1), method= 'lm', se = F, color = 'black') +
labs(x = '化學分數', y = '總成績', title = '學校')
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
各區域以化學預測總分時的截距與斜率
ggplot(data = dta, aes(x = chem, y = sciscore, group = area))+
stat_smooth(method = 'lm', se = F, color = 'lightgray') +
geom_point(size = 1) +
stat_smooth(aes(group = 1), method = 'lm', se = F, color = 'black') +
labs(x = '化學分數', y = '總成績' , title = '區域')
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
剛剛資料太多不清楚,所以隨機選取25個學校與區域重畫
重看一次各校以化學預測總分時的截距與斜率
dta$chem_centered <- scale(dta$chem, scale = F)
set.seed(1225)
ns25 <- sample(levels(dta$schID), 25)
set.seed(1225)
nr25 <- sample(levels(dta$area), 25)
ggplot(data = dta[dta$schID %in% ns25, ], aes(x = chem_centered, y = sciscore, color = gender))+
geom_point(size = 1) +
stat_smooth(method = 'lm', se = F) +
facet_wrap( ~ schID)
## `geom_smooth()` using formula = 'y ~ x'
重看一次各區域以化學預測總分時的截距與斜率
ggplot(data = dta[dta$area %in% nr25, ], aes(x = chem_centered, y = sciscore, color = gender))+
geom_point(size = 1) +
stat_smooth(method = 'lm', se = F) +
facet_wrap( ~ area )
## `geom_smooth()` using formula = 'y ~ x'
底下是第三部份,診斷(檢查假設)
- 診斷就是要看,資料是否與分析模型的假設一致
- 如果一致,我們對分析結果會比較有信心
- 如果不一致,需要修正模型、或是說明分析結果時要小心
學校與區域層次的隨機效果QQ圖,檢查隨機效果是否呈常態
qq_r21 <- qqmath(~ ranef(m0, condVar = T)$schID, type = c('p', 'g', 'r'), pch = '.',xlab = '常態分位數', ylab = '學校隨機效果分位數')
qq_r22 <- qqmath(~ ranef(m0, condVar = T)$area, type = c('p', 'g', 'r'), pch = '.',xlab = '常態分位數', ylab = '區域隨機效果分位數')
殘差的QQ圖,檢驗殘差是否呈常態
qq_r0 <- qqmath(~ resid(m0, scale = T), type = c('p', 'g', 'r'), pch = '.',xlab = '標準化常態分位數', ylab = '標準化殘差')
預測值對殘差圖,檢查殘差與獨變項無關的假設
m0_f <- fortify.merMod(m0)
r_m0 <- ggplot(data = m0_f, aes(x = .fitted, y = .scresid)) +
geom_point(pch = '.') +
stat_smooth(method = 'loess', se = F) +
labs( x = '預測值', y = '標準化殘差')
把圖放在一起呈現
grid.arrange(qq_r21, qq_r22, qq_r0, r_m0, nrow = 2, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'