knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
include = TRUE,
error = TRUE,
fig.width = 8,
fig.height = 4
)
Load dependencies and preprocessed datasets
library(tidyverse)
library(ggrepel)
library(brms)
library(tidybayes)
load("../unshareable_data/preprocessed/tl.Rda")
load("data/preprocessed/census.Rda")
sim_pop_sample_with_draws <- readRDS("data/simulated/sim_pop_sample_with_draws.rds")
options(mc.cores = 4,
brms.backend = "cmdstanr")
options(scipen = 999,
digits = 4)
# windowsFonts(Times = windowsFont("Times New Roman"))
theme_set(theme_minimal(base_size = 12, base_family = "Times"))
# exclude first twins to avoid twin dependency issues
tl_no_1st_twins <- tl %>%
filter(ptyp != 1)
In order to clearly see the contribution of poststratification as opposed to only making predictions based on the regression model (as is done in continuous norming and similar approaches for example), here we compare raw and MRP means to predictions from the regularised prediction model without poststratification (MR). We did this using a function at an earlier stage of the project to facilitate comparing several models:
mrp_age_norms <- function(ps_table, ps_variables, brm) {
norming_sample <- brm$data
sim_pop_sample <- ps_table %>%
filter(census_n != 0) %>%
ungroup() %>%
sample_n(size = 100000, # 67738120 is total census_n in PS table
weight = census_n,
replace = TRUE) %>%
select(all_of(ps_variables))
sim_pop_sample_with_draws <- sim_pop_sample %>%
add_predicted_draws(brm, ndraws = 1000, seed = 810,
allow_new_levels = T) %>%
mutate(.prediction = case_when(.prediction < 0 ~ 0, # censor predictions which go over scale limits
.prediction > 56 ~ 56,
TRUE ~ .prediction))
means_sds_and_ses_MRP <- sim_pop_sample_with_draws %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction), sd_prediction = sd(.prediction)) %>%
summarise(MRP_mean = mean(mean_prediction),
MRP_seOfmean = sd(mean_prediction),
MRP_sd = sqrt(mean(sd_prediction^2)),
MRP_seOfsd = sd(sd_prediction))
sim_norming_sample <- brm$data %>%
select(all_of(ps_variables)) %>%
mutate(age = floor(age)) %>%
group_by_all() %>%
summarise(Raw_n = n()) %>%
ungroup() %>%
sample_n(size = 100000, weight = Raw_n, replace = TRUE)
sim_norming_sample_with_draws <- sim_norming_sample %>%
select(all_of(ps_variables)) %>%
add_predicted_draws(brm, ndraws = 1000, seed = 810,
allow_new_levels = T) %>%
mutate(.prediction = case_when(.prediction < 0 ~ 0, # censor predictions which go over scale limits
.prediction > 56 ~ 56,
TRUE ~ .prediction))
means_sds_and_ses_MR <- sim_norming_sample_with_draws %>%
mutate(age = floor(age)) %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction), sd_prediction = sd(.prediction)) %>%
summarise(MR_mean = mean(mean_prediction),
MR_seOfmean = sd(mean_prediction),
MR_sd = sqrt(mean(sd_prediction^2)),
MR_seOfsd = sd(sd_prediction))
means_sds_and_ses_raw <- norming_sample %>%
mutate(age = floor(age)) %>%
group_by(age) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = TRUE),
Raw_sd = sd(cft, na.rm = TRUE),
Raw_seOfmean = Raw_sd/sqrt(Raw_n))
means_ns_sds_and_ses <- means_sds_and_ses_MRP %>%
left_join(means_sds_and_ses_MR, by = c("age")) %>%
left_join(means_sds_and_ses_raw, by = "age") %>%
filter(age <= 65) %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "(.*)_(.*)")
means_plot <- means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*seOfmean, ymax = mean + 1.96*seOfmean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65),
aes(label = source), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#cc79a7", "#0072b2", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
SDs_plot <- means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = sd, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = sd - 1.96*seOfsd, ymax = sd + 1.96*seOfsd),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65),
aes(label = source), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "x") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#cc79a7", "#0072b2", "#e69f00")) +
labs(x = "Age", y = "CFT 20-R Score SD")
SEs_plot <- means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = seOfmean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_point(size = 3, shape = 18) +
geom_text(
data = means_ns_sds_and_ses %>% filter(age == 65),
aes(label = source), family = "Times", nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#cc79a7", "#0072b2", "#e69f00")) +
labs(x = "Age", y = "SE of Mean")
mrp_overall_estimates <- sim_pop_sample_with_draws %>%
filter(age <= 65) %>%
group_by(.draw) %>%
summarise(mean_prediction = mean(.prediction), sd_prediction = sd(.prediction)) %>%
summarise("MRP Mean" = mean(mean_prediction),
"MRP SE of Mean" = sd(mean_prediction),
"MRP SD" = sqrt(mean(sd_prediction^2)),
"MRP SE of SD" = sd(sd_prediction))
mr_overall_estimates <- sim_norming_sample_with_draws %>%
filter(age <= 65) %>%
group_by(.draw) %>%
summarise(mean_prediction = mean(.prediction), sd_prediction = sd(.prediction)) %>%
summarise("MR Mean" = mean(mean_prediction),
"MR SE of Mean" = sd(mean_prediction),
"MR SD" = sqrt(mean(sd_prediction^2)),
"MR SE of SD" = sd(sd_prediction))
return(list(
means_plot = means_plot,
SDs_plot = SDs_plot,
SEs_plot = SEs_plot,
mrp_overall_estimates = mrp_overall_estimates,
mr_overall_estimates = mr_overall_estimates))
}
brm_age_by_educ <-
brm(bf(
cft ~ (1 | mig) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age, by = educ),
sigma ~ (1 | mig) + (1 | educ) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age)),
chains = 4,
iter = 5000,
family = gaussian(),
seed = 810,
control = list(adapt_delta = 0.99),
file = "../unshareable_data/brms/brm_age_by_educ_18_delta_99_5000",
file_refit = "never",
data = tl_no_1st_twins)
mrp_age_norms(ps_table = census,
ps_variables = c("age", "educ", "mig", "male"),
brm = brm_age_by_educ)
## $means_plot
##
## $SDs_plot
##
## $SEs_plot
##
## $mrp_overall_estimates
## # A tibble: 1 × 4
## `MRP Mean` `MRP SE of Mean` `MRP SD` `MRP SE of SD`
## <dbl> <dbl> <dbl> <dbl>
## 1 35.7 0.107 8.53 0.0883
##
## $mr_overall_estimates
## # A tibble: 1 × 4
## `MR Mean` `MR SE of Mean` `MR SD` `MR SE of SD`
## <dbl> <dbl> <dbl> <dbl>
## 1 37.4 0.0704 8.14 0.0560
Besides the obvious effect that poststratification has on means, it also smoothes SD estimates and considerably reduces SEs of both means and SDs.
Weighting is often undertaken using simple averages instead of model predictions. Here we apply poststratification weights we calculated based on population data and use weights provided by TwinLife and compare the results to MRP and raw data.
means_sds_and_ses_MRP <- sim_pop_sample_with_draws %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP_mean = mean(mean_prediction),
MRP_se_of_mean = sd(mean_prediction),
MRP_sd = sqrt(mean(sd_prediction^2)),
MRP_se_of_sd = sd(sd_prediction)) %>%
filter(age <= 65)
means_sds_and_ses_tl_ps <- tl_no_1st_twins %>%
mutate(age = age0100) %>%
left_join((census %>%
filter(age <= 65) %>%
mutate(PS_w = census_n/sum(census_n))), by = c("age", "male", "mig", "educ")) %>%
filter(age <= 65) %>%
# set PS weights of cells that exist in the sample but not in the census to 0
mutate(PS_w = ifelse(is.na(PS_w), 0, PS_w)) %>%
group_by(age) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n),
PS_mean = weighted.mean(cft, PS_w, na.rm = T),
PS_sd = sqrt(weighted.mean((cft - PS_mean)^2, PS_w, na.rm = T)),
PS_se_of_mean = PS_sd/sqrt(Raw_n))
means_ns_sds_and_ses <- means_sds_and_ses_MRP %>%
left_join(means_sds_and_ses_tl_ps, by = "age") %>%
pivot_longer(-c(age), names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses
%>% filter(age == 65) %>% arrange(source),
aes(label = c("MRP", "P", "Raw")),
family = "Times", seed = 810, nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#0072b2", "#b66dff", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
means_ns_sds_and_ses %>% group_by(source) %>% summarise(means = mean(mean), ses = mean(se_of_mean))
## # A tibble: 3 × 3
## source means ses
## <chr> <dbl> <dbl>
## 1 MRP 35.6 0.315
## 2 PS 36.9 0.800
## 3 Raw 36.8 0.845
census_educ_only <- census %>%
group_by(educ) %>%
summarise(census_n = sum(census_n)) %>%
ungroup()
means_sds_and_ses_tl_ps <- tl_no_1st_twins %>%
mutate(age = age0100) %>%
left_join((census_educ_only
%>% mutate(PS_w = census_n/sum(census_n))), by = c("educ")) %>%
group_by(age) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n),
P_mean = weighted.mean(cft, PS_w, na.rm = T),
P_sd = sqrt(weighted.mean((cft - P_mean)^2, PS_w, na.rm = T)),
P_se_of_mean = P_sd/sqrt(Raw_n)) %>%
filter(!is.na(age))
means_ns_sds_and_ses <- means_sds_and_ses_MRP %>%
left_join(means_sds_and_ses_tl_ps, by = "age") %>%
pivot_longer(-c(age), names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses
%>% filter(age == 65) %>% arrange(source),
aes(label = c("MRP", "P", "Raw")),
family = "Times", seed = 810, nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#0072b2", "#b66dff", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
means_ns_sds_and_ses %>% group_by(source) %>% summarise(means = mean(mean), ses = mean(se_of_mean))
## # A tibble: 3 × 3
## source means ses
## <chr> <dbl> <dbl>
## 1 MRP 35.6 0.315
## 2 P 35.8 0.826
## 3 Raw 36.8 0.845
means_sds_and_ses_MRP <- sim_pop_sample_with_draws %>%
mutate(age_group = case_when(
age == 11 ~ '11',
age == 12 ~ '12',
age == 13 ~ '13',
age == 14 ~ '14',
age == 15 ~ '15',
age == 16 ~ '16',
age >= 17 & age <= 19 ~ '17-19',
age >= 20 & age <= 24 ~ '20-24',
age >= 25 & age <= 29 ~ '25-29',
age >= 30 & age <= 34 ~ '30-34',
age >= 35 & age <= 39 ~ '35-39',
age >= 40 & age <= 44 ~ '40-44',
age >= 45 & age <= 49 ~ '45-49',
age >= 50 & age <= 54 ~ '50-54',
age >= 55 & age <= 59 ~ '55-59',
age >= 60 & age <= 64 ~ '60-64',
TRUE ~ NA_character_)) %>%
group_by(age_group, .draw) %>%
summarise(mean_prediction = mean(.prediction), sd_prediction = sd(.prediction)) %>%
summarise(MRP_mean = mean(mean_prediction),
MRP_se_of_mean = sd(mean_prediction),
MRP_sd = sqrt(mean(sd_prediction^2)),
MRP_se_of_sd = sd(sd_prediction)) %>%
filter(!is.na(age_group))
census_manual_age_groups <- census %>%
mutate(age_group = case_when(
age == 11 ~ '11',
age == 12 ~ '12',
age == 13 ~ '13',
age == 14 ~ '14',
age == 15 ~ '15',
age == 16 ~ '16',
age >= 17 & age <= 19 ~ '17-19',
age >= 20 & age <= 24 ~ '20-24',
age >= 25 & age <= 29 ~ '25-29',
age >= 30 & age <= 34 ~ '30-34',
age >= 35 & age <= 39 ~ '35-39',
age >= 40 & age <= 44 ~ '40-44',
age >= 45 & age <= 49 ~ '45-49',
age >= 50 & age <= 54 ~ '50-54',
age >= 55 & age <= 59 ~ '55-59',
age >= 60 & age <= 64 ~ '60-64',
TRUE ~ NA_character_)) %>%
group_by(age_group, male, mig, educ) %>%
summarise(census_n = sum(census_n)) %>%
ungroup() %>%
filter(!is.na(age_group))
load("../unshareable_data/preprocessed/manual_norms.Rda")
means_sds_and_ses_tl_ps <- tl_no_1st_twins %>%
left_join((census_manual_age_groups
%>% mutate(PS_w = census_n/sum(census_n))), by = c("age_group", "male", "mig", "educ")) %>%
group_by(age_group) %>%
mutate(PS_w = ifelse(is.na(PS_w), 0, PS_w)) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n),
P_mean = weighted.mean(cft, PS_w, na.rm = T),
P_sd = sqrt(weighted.mean((cft - P_mean)^2, PS_w, na.rm = T)),
P_se_of_mean = P_sd/sqrt(Raw_n)) %>%
filter(!is.na(age_group))
means_ns_sds_and_ses_plot <- means_sds_and_ses_MRP %>%
left_join(means_sds_and_ses_tl_ps, by = "age_group")%>%
left_join(manual_norms, by = "age_group") %>%
pivot_longer(-c(age_group), names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses_plot %>%
ggplot(aes(x = age_group, y = mean, group = source, colour = source)) +
scale_x_discrete(expand = expansion(add = c(0, 2))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 6,
linewidth = 1.5,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses_plot %>% filter(age_group == "60-64"),
aes(label = c(source[1:3], "")),
family = "Times", seed = 810, nudge_x = .5, hjust = 0) +
labs(x = "Age Group", y = "Mean CFT 20-R Score", ) +
geom_text_repel(
data = means_ns_sds_and_ses_plot %>% filter(age_group == "14"),
aes(label = c("", "", "", "Manual, school sample")),
family = "Times", seed = 810, nudge_y = 2.5) +
geom_text_repel(
data = means_ns_sds_and_ses_plot %>% filter(age_group == "40-44"),
aes(label = c("", "", "", "Manual, TwinLife sample")),
family = "Times", seed = 810, nudge_y = -3) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
geom_vline(xintercept = 7.5) +
scale_colour_manual(values = c("#009e73", "#0072b2", "#b66dff", "#e69f00"))
Here we use the weights provided by the TwinLife panel study data custodians, described here. We compare them with both raw and MRP means.
tl_no_1st_twins_w_ws <- tl_no_1st_twins %>%
left_join((haven::read_dta("../unshareable_data/raw/ZA6701_weights_v7-1-0.dta") %>%
select(c(fid, svw0100, svw0200))), by = "fid") %>%
# rename the weights
mutate(dw = svw0100, # design weights
nrw = svw0200, # non-response weights
dnrw = svw0100*svw0200) # total weights (both weights together)
means_sds_and_ses_MRP <- sim_pop_sample_with_draws %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP_mean = mean(mean_prediction),
MRP_se_of_mean = sd(mean_prediction),
MRP_sd = sqrt(mean(sd_prediction^2)),
MRP_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl <- tl_no_1st_twins_w_ws %>%
group_by(age0100) %>%
summarise(n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(n),
DesignWeighted_mean = weighted.mean(cft, dw, na.rm = T),
DesignWeighted_sd = sqrt(weighted.mean((cft - DesignWeighted_mean)^2, dw, na.rm = T)),
DesignWeighted_se_of_mean = DesignWeighted_sd/sqrt(n),
NonresponseWeighted_mean = weighted.mean(cft, nrw, na.rm = T),
NonresponseWeighted_sd = sqrt(weighted.mean((cft - NonresponseWeighted_mean)^2, nrw, na.rm = T)),
NonresponseWeighted_se_of_mean = NonresponseWeighted_sd/sqrt(n),
TotalWeighted_mean = weighted.mean(cft, dnrw, na.rm = T),
TotalWeighted_sd = sqrt(weighted.mean((cft - TotalWeighted_mean)^2, dnrw, na.rm = T)),
TotalWeighted_se_of_mean = TotalWeighted_sd/sqrt(n)) %>%
rename(age = age0100)
means_ns_sds_and_ses <- means_sds_and_ses_MRP %>%
left_join(means_sds_and_ses_tl, by = "age") %>%
filter(age <= 65) %>%
pivot_longer(-c(age, n), names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = c("Design weighted", "MRP", "Non-response weighted", "Raw", "Total weighted")), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#490092", "#0072b2", "#920000","#e69f00","#24ff24")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
No large differences to raw means but it’s hard to see details.
means_ns_sds_and_ses %>%
group_by(source) %>%
summarise(mean = mean(mean, na.rm = TRUE)) %>%
arrange(mean)
## # A tibble: 5 × 2
## source mean
## <chr> <dbl>
## 1 MRP 35.6
## 2 NonresponseWeighted 36.4
## 3 TotalWeighted 36.5
## 4 Raw 36.8
## 5 DesignWeighted 36.9
Overall, non response weighting also recognised an overestimation and corrected for it, but not as much as MRP.
Since the previous plot was too cluttered, here’s one with non-response only since it’s the one that seems to make the largest difference to raw means.
means_ns_sds_and_ses %>%
filter(source != "DesignWeighted" & source != "TotalWeighted") %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text(
data = means_ns_sds_and_ses
%>% filter(source != "DesignWeighted" & source != "TotalWeighted")
%>% filter(age == 65),
aes(label = c("MRP", "Raw", "Non-response weighted")),
family = "Times", seed = 810, nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#0072b2", "#920000", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = sd, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = sd - 1.96*se_of_sd, ymax = sd + 1.96*se_of_sd),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = c("Design weighted", "MRP", "Non-response weighted", "Raw", "Total weighted")),
family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#490092", "#0072b2", "#920000","#e69f00","#24ff24")) +
labs(x = "Age", y = "CFT 20-R Score SD")
means_ns_sds_and_ses %>%
group_by(source) %>%
summarise(sd = sqrt(mean(sd^2))) %>%
arrange(sd)
## # A tibble: 5 × 2
## source sd
## <chr> <dbl>
## 1 DesignWeighted 7.87
## 2 TotalWeighted 8.07
## 3 Raw 8.12
## 4 MRP 8.19
## 5 NonresponseWeighted 8.23
No large differences in SDs.
Our main analysis exludes 1st twins to avoid having to deal with the twins’ dependent observations and because we used the data of the 1st twins extensively for model comparison. Here we test our results’ robustsness against including them.
brm_age_by_educ_with_1st_twins <-
brm(bf(
cft ~ (1 | mig) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age, by = educ),
sigma ~ (1 | mig) + (1 | educ) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age)),
family = gaussian(),
chains = 4,
iter = 5000,
seed = 810,
control = list(adapt_delta = 0.99),
file = "../unshareable_data/brms/brm_age_by_educ_18_delta_99_5000_with_1st_twins",
file_refit = "never",
data = tl) %>%
add_criterion("loo")
means_sds_and_ses_MRP1 <- sim_pop_sample_with_draws %>% # the one generated in the tutorial
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP1_mean = mean(mean_prediction),
MRP1_se_of_mean = sd(mean_prediction),
MRP1_sd = sqrt(mean(sd_prediction^2)),
MRP1_se_of_sd = sd(sd_prediction))
sim_pop_sample2 <- census %>%
sample_n(size = 100000,
weight = census_n,
replace = TRUE)
sim_pop_sample_with_draws2 <- sim_pop_sample2 %>%
add_predicted_draws(brm_age_by_educ_with_1st_twins, # based on the 1/family model
ndraws = 1000,
seed = 810,
allow_new_levels = TRUE) %>%
mutate(.prediction = round(case_when(.prediction < 0 ~ 0,
.prediction > 56 ~ 56,
TRUE ~ .prediction)))
means_sds_and_ses_MRP2 <- sim_pop_sample_with_draws2 %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP2_mean = mean(mean_prediction),
MRP2_se_of_mean = sd(mean_prediction),
MRP2_sd = sqrt(mean(sd_prediction^2)),
MRP2_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl <- tl %>%
group_by(age0100) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n)) %>%
rename(age = age0100)
means_ns_sds_and_ses <- means_sds_and_ses_MRP1 %>%
left_join(means_sds_and_ses_MRP2, by = c("age")) %>%
left_join(means_sds_and_ses_tl, by = "age") %>%
filter(age <= 65)
means_ns_sds_and_ses_plot <- means_ns_sds_and_ses %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses_plot %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses_plot %>% filter(age == 65) %>% arrange(source),
aes(label = c("MRP, excl. 1st twins", "MRP, incl. 1st twins", "Raw")), family = "Times", seed = 810, nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#0072b2", "#000000", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
means_ns_sds_and_ses <- means_ns_sds_and_ses %>%
mutate(dif_mrp_raw_abs = abs(MRP1_mean - Raw_mean),
dif_mrp_raw = MRP1_mean - Raw_mean,
dif_mrps = MRP1_mean - MRP2_mean,
dif_mrps_abs = abs(MRP1_mean - MRP2_mean))
mean(means_ns_sds_and_ses$dif_mrp_raw, na.rm=T)
## [1] -1.194
mean(means_ns_sds_and_ses$dif_mrp_raw_abs, na.rm=T)
## [1] 1.608
mean(means_ns_sds_and_ses$dif_mrps)
## [1] 0.00271
mean(means_ns_sds_and_ses$dif_mrps_abs)
## [1] 0.1411
Since the TwinLife sample is composed of families, one could argue that the dependence among estimates of family members violates the i.i.d assumption and leads to underestimation of SEs. To check the robustness of our results against this violation, we ran the same MRP but with only one person chosen randomly out of each family, thus eliminating the dependency.
set.seed(14)
tl_no_1st_twins_1_per_fid <- tl_no_1st_twins %>%
group_by(fid) %>%
sample_n(1)
brm_age_by_educ_1_per_fid <-
brm(bf(
cft ~ (1 | mig) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age, by = educ),
sigma ~ (1 | mig) + (1 | educ) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age)),
chains = 4,
iter = 6000,
family = gaussian(),
seed = 810,
control = list(adapt_delta = 0.99),
file = "../unshareable_data/brms/brm_age_by_educ_1_per_fid",
file_refit = "never",
data = tl_no_1st_twins_1_per_fid) %>%
add_criterion("loo")
means_sds_and_ses_MRP1 <- sim_pop_sample_with_draws %>% # the one generated in the tutorial
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP1_mean = mean(mean_prediction),
MRP1_se_of_mean = sd(mean_prediction),
MRP1_sd = sqrt(mean(sd_prediction^2)),
MRP1_se_of_sd = sd(sd_prediction))
sim_pop_sample2 <- census %>%
sample_n(size = 100000,
weight = census_n,
replace = TRUE)
sim_pop_sample_with_draws2 <- sim_pop_sample2 %>%
add_predicted_draws(brm_age_by_educ_1_per_fid, # based on the 1/family model
ndraws = 1000,
seed = 810,
allow_new_levels = TRUE) %>%
mutate(.prediction = round(case_when(.prediction < 0 ~ 0,
.prediction > 56 ~ 56,
TRUE ~ .prediction)))
means_sds_and_ses_MRP2 <- sim_pop_sample_with_draws2 %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP2_mean = mean(mean_prediction),
MRP2_se_of_mean = sd(mean_prediction),
MRP2_sd = sqrt(mean(sd_prediction^2)),
MRP2_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl <- tl_no_1st_twins_1_per_fid %>%
group_by(age0100) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n)) %>%
rename(age = age0100)
means_ns_sds_and_ses <- means_sds_and_ses_MRP1 %>%
left_join(means_sds_and_ses_MRP2, by = c("age")) %>%
left_join(means_sds_and_ses_tl, by = "age") %>%
filter(age <= 65)
means_ns_sds_and_ses_plot <- means_ns_sds_and_ses %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses_plot %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text(
data = means_ns_sds_and_ses_plot %>% filter(age == 65),
aes(label = c("MRP, full sample", "MRP, 1 person/family", "Raw")),
family = "Times", nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#0072b2", "#56b4e9", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
means_ns_sds_and_ses <- means_ns_sds_and_ses %>%
mutate(dif_mrp_raw_abs = abs(MRP1_mean - Raw_mean),
dif_mrp_raw = MRP1_mean - Raw_mean,
dif_mrps = MRP1_mean - MRP2_mean,
dif_mrps_abs = abs(MRP1_mean - MRP2_mean))
mean(means_ns_sds_and_ses$dif_mrp_raw, na.rm=T)
## [1] -1.09
mean(means_ns_sds_and_ses$dif_mrp_raw_abs, na.rm=T)
## [1] 1.751
mean(means_ns_sds_and_ses$dif_mrps)
## [1] -0.01851
mean(means_ns_sds_and_ses$dif_mrps_abs)
## [1] 0.2698
Despite shrinking the sample size from N = 10059 to 4044, the results seem to be largely unchanged. Only the SEs slightly increased as would be expected.
Some participants were assigned an education category that combines two degrees with variable ISCED levels: “university of applied sciences, university of cooperative education”. While a degree from a university of applied sciences (Fachhochschule) would put one in ISCED level 5a, a degree from a university of cooperative education (Berufsakademie) is equivalent to ISCED level 5b. TwinLife assigns all people who have this ambivalent category to ISCED 5a. Here we test the robustness of our results against excluding those 357 participants.
tl_no_1st_twins_no_berufsakad_with_fachhochschule <- tl_no_1st_twins %>%
filter(!(eca0108 == "level 5a" & eca0230 == 8))
brm_age_by_educ_modif_isced5a <-
brm(bf(
cft ~ (1 | mig) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age, by = educ),
sigma ~ (1 | mig) + (1 | educ) + (male) + (1 | mig:male) + (1 | mig:educ) + (1 | male:educ) + (1 | mig:educ:male) + s(age)),
family = gaussian(),
chains = 4,
iter = 5000,
seed = 810,
control = list(adapt_delta = 0.99),
file = "../unshareable_data/brms/brm_age_by_educ_18_delta_99_5000_isced5a",
file_refit = "never",
data = tl_no_1st_twins_no_berufsakad_with_fachhochschule) %>%
add_criterion("loo")
means_sds_and_ses_MRP1 <- sim_pop_sample_with_draws %>% # the one generated in the tutorial
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP1_mean = mean(mean_prediction),
MRP1_se_of_mean = sd(mean_prediction),
MRP1_sd = sqrt(mean(sd_prediction^2)),
MRP1_se_of_sd = sd(sd_prediction))
sim_pop_sample2 <- census %>%
sample_n(size = 100000,
weight = census_n,
replace = TRUE)
sim_pop_sample_with_draws2 <- sim_pop_sample2 %>%
add_predicted_draws(brm_age_by_educ_modif_isced5a, # based on the 1/family model
ndraws = 1000,
seed = 810,
allow_new_levels = TRUE) %>%
mutate(.prediction = round(case_when(.prediction < 0 ~ 0,
.prediction > 56 ~ 56,
TRUE ~ .prediction)))
means_sds_and_ses_MRP2 <- sim_pop_sample_with_draws2 %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(MRP2_mean = mean(mean_prediction),
MRP2_se_of_mean = sd(mean_prediction),
MRP2_sd = sqrt(mean(sd_prediction^2)),
MRP2_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl <- tl_no_1st_twins_no_berufsakad_with_fachhochschule %>%
group_by(age0100) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n)) %>%
rename(age = age0100)
means_ns_sds_and_ses <- means_sds_and_ses_MRP1 %>%
left_join(means_sds_and_ses_MRP2, by = c("age")) %>%
left_join(means_sds_and_ses_tl, by = "age") %>%
filter(age <= 65)
means_ns_sds_and_ses_plot <- means_ns_sds_and_ses %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses_plot %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(12, 64, by = 2),
expand = expansion(add = c(1, 15))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses_plot %>% filter(age == 65) %>% arrange(source),
aes(label = c("MRP, full sample", "MRP, no BeAk/FHS", "Raw")),
family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c("#0072b2", "#924900", "#e69f00")) +
labs(x = "Age", y = "Mean CFT 20-R Score")
means_ns_sds_and_ses <- means_ns_sds_and_ses %>%
mutate(dif_mrp_raw_abs = abs(MRP1_mean - Raw_mean),
dif_mrp_raw = MRP1_mean - Raw_mean,
dif_mrps = MRP1_mean - MRP2_mean,
dif_mrps_abs = abs(MRP1_mean - MRP2_mean))
mean(means_ns_sds_and_ses$dif_mrp_raw, na.rm=T)
## [1] -1.057
mean(means_ns_sds_and_ses$dif_mrp_raw_abs, na.rm=T)
## [1] 1.516
mean(means_ns_sds_and_ses$dif_mrps)
## [1] -0.01023
mean(means_ns_sds_and_ses$dif_mrps_abs)
## [1] 0.1141
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.8 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /software/all/FlexiBLAS/3.2.1-GCC-12.2.0/lib64/libflexiblas.so.3.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] tidybayes_3.0.6 brms_2.20.4 Rcpp_1.0.11 ggrepel_0.9.4
## [5] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4
## [13] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] googledrive_2.1.1 colorspace_2.1-0 ellipsis_0.3.2
## [4] QuickJSR_1.0.8 markdown_1.11 base64enc_0.1-3
## [7] fs_1.6.3 rstudioapi_0.15.0 farver_2.1.1
## [10] rstan_2.32.3 svUnit_1.0.6 DT_0.30
## [13] fansi_1.0.5 mvtnorm_1.2-3 lubridate_1.9.3
## [16] xml2_1.3.5 splines_4.2.2 codetools_0.2-19
## [19] bridgesampling_1.1-2 cachem_1.0.8 knitr_1.45
## [22] shinythemes_1.2.0 bayesplot_1.10.0 jsonlite_1.8.7
## [25] broom_1.0.5 dbplyr_2.4.0 ggdist_3.3.0
## [28] shiny_1.8.0 compiler_4.2.2 httr_1.4.7
## [31] backports_1.4.1 Matrix_1.5-3 fastmap_1.1.1
## [34] gargle_1.5.2 cli_3.6.1 later_1.3.1
## [37] prettyunits_1.2.0 htmltools_0.5.7 tools_4.2.2
## [40] igraph_1.5.1 coda_0.19-4 gtable_0.3.4
## [43] glue_1.6.2 reshape2_1.4.4 posterior_1.5.0
## [46] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.6.4
## [49] nlme_3.1-162 crosstalk_1.2.1 tensorA_0.36.2
## [52] xfun_0.41 ps_1.7.5 rvest_1.0.3
## [55] timechange_0.2.0 mime_0.12 miniUI_0.1.1.1
## [58] lifecycle_1.0.4 renv_1.0.3 gtools_3.9.5
## [61] googlesheets4_1.1.1 zoo_1.8-12 scales_1.2.1
## [64] colourpicker_1.3.0 hms_1.1.3 promises_1.2.1
## [67] Brobdingnag_1.2-9 parallel_4.2.2 inline_0.3.19
## [70] shinystan_2.6.0 yaml_2.3.7 gridExtra_2.3
## [73] StanHeaders_2.26.28 loo_2.6.0 sass_0.4.7
## [76] stringi_1.8.2 highr_0.10 dygraphs_1.1.1.6
## [79] checkmate_2.3.0 pkgbuild_1.4.2 cmdstanr_0.6.1
## [82] rlang_1.1.2 pkgconfig_2.0.3 matrixStats_1.1.0
## [85] distributional_0.3.2 evaluate_0.23 lattice_0.22-5
## [88] labeling_0.4.3 rstantools_2.3.1.1 htmlwidgets_1.6.3
## [91] processx_3.8.2 tidyselect_1.2.0 plyr_1.8.9
## [94] magrittr_2.0.3 R6_2.5.1 generics_0.1.3
## [97] DBI_1.1.3 mgcv_1.8-42 pillar_1.9.0
## [100] haven_2.5.3 withr_2.5.2 xts_0.13.1
## [103] abind_1.4-5 modelr_0.1.11 crayon_1.5.2
## [106] arrayhelpers_1.1-0 utf8_1.2.4 tzdb_0.4.0
## [109] rmarkdown_2.25 grid_4.2.2 readxl_1.4.3
## [112] callr_3.7.3 threejs_0.3.3 reprex_2.0.2
## [115] digest_0.6.33 xtable_1.8-4 httpuv_1.6.12
## [118] RcppParallel_5.1.7 stats4_4.2.2 munsell_0.5.0
## [121] bslib_0.6.0 shinyjs_2.1.0