Simulação Regressão Linear: Imputação Múltipla
José Luiz Padilha da Silva
12 de novembro de 2019
Avaliaremos via simulação o impacto dos dados ausentes na estimação dos parâmetros de uma regressão linear de uma resposta contínua y (completamente observada) em função de um preditor binário x (potencialmente não observado) e de um preditor contínuo z (completamente observado). Ajustaremos o modelo nos dados completos, nos dados disponíveis e nos dados imputados. Para a imputação, usaremos o método implementado no pacote mice. O objetivo é verificar o viés e a perda de eficiência em vários cenários de dados ausentes.
Geração dos dados
set.seed(129)

# Packages used
library(mice)
library(progress)
library(tidyverse)
library(gridExtra)

# Data generation
#
# Simulates one complete data set for the model y ~ x + z.
#
# Args:
#   n: sample size; must be a single even number >= 2, because the first
#      half of the rows gets x = 1 and the second half x = 0.
#
# Returns:
#   A data.frame with columns y (numeric), x (factor with levels "0", "1")
#   and z (numeric), where
#     z ~ Normal(0.5 * (x - 0.5), 1)
#     y ~ Normal(x + z, 1)
gera_dados <- function(n = NULL) {
  stopifnot(
    is.numeric(n), length(n) == 1, !is.na(n),
    n >= 2, n %% 2 == 0
  )

  x <- rep(c(1, 0), each = n / 2)
  # Vectorised draws: all z values first, then all y values, which is the
  # same order of draws as one scalar rnorm() call per observation.
  z <- rnorm(n, mean = 0.5 * (x - 0.5), sd = 1)
  y <- rnorm(n, mean = x + z, sd = 1)

  data.frame(y = y, x = as.factor(x), z = z)
}
# Creation of the missing data
#
# Sets values of x to NA according to a logistic model for the probability
# that x is observed:
#   logit(p) = psi.0 + psi.y * y + psi.x * 1(x == 1) + psi.z * z
# A Bernoulli(p) indicator is drawn for every row; x is deleted where the
# indicator is 0.
#
# Args:
#   dados: data.frame with columns y, x and z.
#   psi.0, psi.y, psi.x, psi.z: intercept and coefficients of the linear
#     predictor above.
#
# Returns:
#   A copy of `dados` in which x is NA for the rows drawn as not observed.
gera_dados_m <- function(dados = NULL, psi.0 = NULL, psi.y = NULL,
                         psi.x = NULL, psi.z = NULL) {
  dados.m <- dados
  psi.lp <- psi.0 + psi.y * dados.m$y + psi.x * (dados.m$x == 1) +
    psi.z * dados.m$z
  p <- 1 / (1 + exp(-psi.lp))

  # One draw per row of the data actually passed in, not per the global `n`
  r <- rbinom(nrow(dados.m), size = 1, prob = p)
  dados.m$x[r == 0] <- NA
  dados.m
}

# Simulation settings
total <- 1000 # number of simulated data sets per scenario
n <- 1000     # sample size passed to gera_dados()
m <- 10       # number of imputations passed to mice()
Cenário (a): perda depende de Y
# Scenario (a): only psi.y is nonzero, so the probability used by
# gera_dados_m() to keep x depends on y alone.
psi.0 <- 0
psi.y <- 1
psi.x <- 0
psi.z <- 0

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.3720  0.4060  0.4160  0.4157  0.4260  0.4630
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de Y")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.8 0.9 1.0 1.1 0.6 0.8 1.0 1.2 0.0 0.2 0.4 0.6
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.030 0.035 0.040 0.045 0.06 0.08 0.10 0.12 0.05 0.06 0.07
tipo
v
alue
Perda depende de Y
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo   Intercept     x     z
##   <chr>      <dbl> <dbl> <dbl>
## 1 comp  -0.000519  1.00  1.00
## 2 disp   0.412     0.865 0.870
## 3 imp   -0.0000531 1.00  1.00

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0454 0.0652 0.0317
## 2 disp     0.0624 0.0804 0.0412
## 3 imp      0.0501 0.0841 0.0353
Cenário (b): perda depende de X
# Scenario (b): only psi.x is nonzero, so the probability used by
# gera_dados_m() to keep x depends on x alone.
psi.0 <- 0
psi.y <- 0
psi.x <- 1
psi.z <- 0

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.3370  0.3740  0.3840  0.3837  0.3930  0.4360
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de X")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.9 1.0 1.1 0.7 0.8 0.9 1.0 1.1 1.2 1.3 −0.2 −0.1 0.0 0.1 0.2
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.030 0.035 0.040 0.045 0.06 0.07 0.08 0.09 0.10 0.05 0.06 0.07
tipo
v
alue
Perda depende de X
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept     x     z
##   <chr>     <dbl> <dbl> <dbl>
## 1 comp   0.00165  0.997  1.00
## 2 disp  -0.000100 0.998  1.00
## 3 imp   -0.0701   0.997  1.00

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0454 0.0652 0.0317
## 2 disp     0.0641 0.0845 0.0404
## 3 imp      0.0558 0.0796 0.0339
Cenário (c): perda depende de Z
# Scenario (c): only psi.z is nonzero, so the probability used by
# gera_dados_m() to keep x depends on z alone.
psi.0 <- 0
psi.y <- 0
psi.x <- 0
psi.z <- 1

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.4500  0.4890  0.5000  0.5001  0.5100  0.5460
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de Z")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.9 1.0 1.1 0.8 1.0 1.2 −0.2 −0.1 0.0 0.1 0.2
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.03 0.04 0.05 0.06 0.08 0.10 0.12 0.05 0.06 0.07
tipo
v
alue
Perda depende de Z
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept     x     z
##   <chr>     <dbl> <dbl> <dbl>
## 1 comp    0.00192 0.999 0.999
## 2 disp    0.00300 1.000 0.998
## 3 imp     0.00244 0.998 1.00

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0455 0.0653 0.0317
## 2 disp     0.0677 0.0924 0.0493
## 3 imp      0.0557 0.0860 0.0360
Cenário (d): perda depende de X e Z
# Scenario (d): psi.x and psi.z are nonzero, so the probability used by
# gera_dados_m() to keep x depends on x and z.
psi.0 <- -0.25
psi.y <- 0
psi.x <- 0.5
psi.z <- 1

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.4450  0.4880  0.5000  0.4996  0.5110  0.5430
# Call restored from the title of the rendered figure that follows; it is
# missing from the extracted text.
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de X e Z")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.9 1.0 1.1 0.8 1.0 1.2 −0.2 −0.1 0.0 0.1 0.2
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.03 0.04 0.05 0.06 0.08 0.10 0.12 0.05 0.06 0.07 0.08
tipo
v
alue
Perda depende de X e Z
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept     x     z
##   <chr>     <dbl> <dbl> <dbl>
## 1 comp    0.00127 1.00   1.00
## 2 disp    0.00288 1.000  1.00
## 3 imp    -0.0451  0.998  1.02

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0454 0.0652 0.0317
## 2 disp     0.0720 0.0929 0.0491
## 3 imp      0.0587 0.0862 0.0361
Cenário (e): perda depende de Y e X
# Scenario (e): psi.y and psi.x are nonzero, so the probability used by
# gera_dados_m() to keep x depends on y and x.
psi.0 <- -1
psi.y <- 1
psi.x <- 2
psi.z <- 0

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.3980  0.4260  0.4350  0.4347  0.4430  0.4710
# Call restored from the title of the rendered figure that follows; it is
# missing from the extracted text.
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de Y e X")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.8 0.9 1.0 1.1 1.2 0.50 0.75 1.00 1.25 −0.25 0.00 0.25 0.50 0.75
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.030 0.035 0.040 0.045 0.06 0.09 0.12 0.15 0.18 0.04 0.06 0.08 0.10 0.12
tipo
v
alue
Perda depende de Y e X
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept     x     z
##   <chr>     <dbl> <dbl> <dbl>
## 1 comp    0.00154 0.997 1.00
## 2 disp    0.562   0.596 0.901
## 3 imp     0.00374 0.743 1.08

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0454 0.0652 0.0317
## 2 disp     0.0803 0.0920 0.0419
## 3 imp      0.0735 0.107  0.0358
Cenário (f): perda depende de Y e Z
# Scenario (f): psi.y and psi.z are nonzero, so the probability used by
# gera_dados_m() to keep x depends on y and z.
psi.0 <- 0
psi.y <- 1
psi.x <- 0
psi.z <- 1

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.3790  0.4240  0.4350  0.4347  0.4450  0.4880
# Call restored from the title of the rendered figure that follows; it is
# missing from the extracted text.
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de Y e Z")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.7 0.8 0.9 1.0 1.1 0.7 0.9 1.1 −0.2 0.0 0.2 0.4 0.6
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.030 0.035 0.040 0.045 0.050 0.055 0.06 0.08 0.10 0.12 0.05 0.06 0.07
tipo
v
alue
Perda depende de Y e Z
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept     x     z
##   <chr>     <dbl> <dbl> <dbl>
## 1 comp    0.00139 0.998 1.000
## 2 disp    0.402   0.890 0.801
## 3 imp     0.00135 0.999 1.00

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0454 0.0652 0.0316
## 2 disp     0.0674 0.0828 0.0472
## 3 imp      0.0510 0.0845 0.0369
Cenário (g): perda depende de Y, X e Z
# Scenario (g): psi.y, psi.x and psi.z are all nonzero, so the probability
# used by gera_dados_m() to keep x depends on y, x and z.
psi.0 <- -1
psi.y <- 1
psi.x <- 2
psi.z <- 1

# Result holders: one row per replicate, one column per model coefficient,
# for the complete-data (comp), available-case (miss) and imputed (imp) fits.
coef.est.comp <- coef.est.miss <- coef.est.imp <-
  coef.std.comp <- coef.std.miss <- coef.std.imp <-
  as.data.frame(matrix(NA_real_, nrow = total, ncol = 3,
                       dimnames = list(NULL, c("Intercept", "x", "z"))))
prop.miss <- numeric(total)

pb <- progress_bar$new(total = total)
for (i in seq_len(total)) {
  pb$tick()

  # Fit on the complete data
  dados.comp <- gera_dados(n = n)
  fit.comp <- glm(y ~ x + z, family = gaussian, data = dados.comp)
  tab.comp <- coef(summary(fit.comp))
  coef.est.comp[i, ] <- tab.comp[, "Estimate"]
  coef.std.comp[i, ] <- tab.comp[, "Std. Error"]

  # Fit on the data with missing x
  dados.miss <- gera_dados_m(dados = dados.comp, psi.0 = psi.0, psi.y = psi.y,
                             psi.x = psi.x, psi.z = psi.z)
  prop.miss[i] <- mean(is.na(dados.miss$x))
  fit.miss <- glm(y ~ x + z, family = gaussian, data = dados.miss)
  tab.miss <- coef(summary(fit.miss))
  coef.est.miss[i, ] <- tab.miss[, "Estimate"]
  coef.std.miss[i, ] <- tab.miss[, "Std. Error"]

  # Fit on the imputed data; only x (second column) has an imputation method
  dados.imp <- mice(dados.miss, m = m, printFlag = FALSE,
                    method = c("", "logreg", ""))
  fit.imp <- pool(with(dados.imp, glm(y ~ x + z, family = gaussian)))
  # Select by name: column positions of summary(pool()) differ between
  # mice releases, so [, 1] / [, 2] is not reliable.
  tab.imp <- summary(fit.imp)
  coef.est.imp[i, ] <- tab.imp$estimate
  coef.std.imp[i, ] <- tab.imp$std.error
}

# Long format (columns key/value) plus the data-type label used in the plots
coef.est <- rbind(gather(coef.est.comp), gather(coef.est.miss),
                  gather(coef.est.imp))
coef.std <- rbind(gather(coef.std.comp), gather(coef.std.miss),
                  gather(coef.std.imp))
coef.est$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)
coef.std$tipo <- rep(c("comp", "disp", "imp"), each = total * 3)

p1 <- ggplot(coef.est, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")
p2 <- ggplot(coef.std, aes(x = tipo, y = value, fill = tipo)) +
  geom_boxplot() +
  guides(fill = "none") +
  theme_bw() +
  facet_wrap(~key, scales = "free_y")

summary(prop.miss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.4020  0.4370  0.4460  0.4461  0.4560  0.4840
# Call restored from the title of the rendered figure that follows; it is
# missing from the extracted text.
grid.arrange(p1, p2, ncol = 1, top = "Perda depende de Y, X e Z")
Intercept x z
comp disp imp comp disp imp comp disp imp
0.7 0.8 0.9 1.0 1.1 1.2 0.4 0.6 0.8 1.0 1.2 −0.25 0.00 0.25 0.50 0.75
tipo
v
alue
Intercept x zcomp disp imp comp disp imp comp disp imp
0.030 0.035 0.040 0.045 0.050 0.055 0.06 0.09 0.12 0.15 0.04 0.06 0.08 0.10 0.12
tipo
v
alue
Perda depende de Y, X e Z
# Average point estimate over the replicates: one row per data type,
# one column per coefficient.
coef.est %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept     x     z
##   <chr>     <dbl> <dbl> <dbl>
## 1 comp  -0.000473 1.00  0.999
## 2 disp   0.506    0.689 0.836
## 3 imp   -0.0413   0.818 1.10

# Average standard error over the replicates, same layout.
coef.std %>%
  group_by(key, tipo) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean)
## # A tibble: 3 x 4
##   tipo  Intercept      x      z
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 comp     0.0454 0.0652 0.0317
## 2 disp     0.0831 0.0921 0.0470
## 3 imp      0.0700 0.100  0.0364