knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
include = TRUE,
error = TRUE,
fig.width = 8,
fig.height = 4
)
library(tidyverse)
library(haven)
library(ggrepel)
library(scales)
library(kableExtra)
library(rstan)
library(kableExtra)
library(gamlss)
library(gamlss.tr)
library(ggnewscale)
options(mc.cores = 4,
brms.backend = "cmdstanr",
scipen = 999,
digits = 4,
width = 120)
theme_set(theme_minimal(base_size = 12, base_family = "Times"))
load("../unshareable_data/preprocessed/tl.Rda")
load("../unshareable_data/preprocessed/manual_norms.Rda")
load("data/preprocessed/de_census/census.Rda")
load("data/preprocessed/de_census/census_margins.Rda")
source("age_norm_comparisons.R")
One plot for age and one for the three other variables.
# Custom sorting function
custom_sort <- function(x) {
st_categories <- x[startsWith(x, "ST")]
isced_categories <- x[startsWith(x, "ISCED")]
other_categories <- x[!startsWith(x, "ST") & !startsWith(x, "ISCED")]
factor(x, levels = c(sort(st_categories), sort(isced_categories), other_categories))
}
# get the marginal distributions from the census and the TL sample
disparities_plot <- census %>%
full_join(tl %>%
group_by(age = age0100, male, educ, mig) %>%
summarise(sample_n = n()) %>%
mutate(sample_n = as.numeric(sample_n)), by = c("age", "male", "educ", "mig")) %>%
mutate(sample_n = replace_na(sample_n, 0)) %>%
pivot_longer(cols = c(census_n:sample_n), names_to = "source", values_to = "n") %>%
mutate(age = as.character(age),
male = as.character(male),
source = str_sub(source, 1, -3)) %>%
pivot_longer(cols = -c(n, source), names_to = "variable", values_to = "category") %>%
group_by(source, variable, category) %>%
summarise(n = sum(n)) %>%
mutate(percentage = n/sum(n)*100) %>%
filter(source == "sample") %>%
bind_rows(census_margins) %>%
mutate(category = case_when(
category == "TRUE" ~ "Male",
category == "FALSE" ~ "Female",
TRUE ~ category)) %>%
mutate(category = custom_sort(category))
# plot age disparities
disparities_plot %>%
filter(variable == "age") %>%
ggplot(aes(x = category, y = percentage, group = source, colour = source, label = round(percentage, digits = 1))) +
geom_line(linewidth = 1) +
facet_grid(cols = vars(variable), scales = "free", space = 'free', as.table = FALSE,
labeller = labeller(variable = function(variable) {
case_when(
variable == "age" ~ "Age",
TRUE ~ as.character(variable))})) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = c(1, 1.08), #
legend.justification = c(1, 1),
legend.background = element_rect(fill = "transparent", color = NA),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linewidth = 0.3, linetype = "dashed")) + # Changed size to linewidth
labs(x = "", y = "", colour = "", label = "Percentage") + # Removed "Percentage" from y-axis label
scale_y_continuous(breaks = seq(0, 100, by = 2), labels = label_percent(scale = 1)) + # Set breaks and percentage labels
scale_colour_manual(values = c("#0072B5FF", "#BC3C29FF"), labels = function(x) str_to_title(x)) +
scale_x_discrete(breaks = seq(11, 65, by = 2))
ggsave("figures/02_disparities_age.jpeg", width = 9, height = 3)
disparities_plot %>%
filter(variable != "age") %>%
ggplot(aes(x = category, y = percentage, group = source, colour = source, label = round(percentage, digits = 1))) +
geom_line(linewidth = 1) +
facet_grid(
cols = vars(variable),
scales = "free",
space = 'free',
as.table = FALSE,
labeller = labeller(variable = function(variable) {
case_when(
variable == "educ" ~ "School Type / ISCED 1997 Code",
variable == "male" ~ "Sex",
variable == "mig" ~ "Migration",
TRUE ~ as.character(variable))})) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = c(1, 1.1), #
legend.justification = c(1, 1),
legend.background = element_rect(fill = "transparent", color = NA),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", linewidth = 0.3)) + # Changed size to linewidth
labs(x = "Category", y = "", colour = "", label = "Percentage") + # Changed y-axis label to empty
scale_y_continuous(labels = label_percent(scale = 1)) + # Use percentage labels
scale_colour_manual(values = c("#0072B5FF", "#BC3C29FF"), labels = function(x) str_to_title(x)) +
geom_text_repel(family = "Times", seed = 14, point.padding = 4, direction = "y", hjust = 0, size = 3)
ggsave("figures/02_disparities_other_vars.jpeg", width = 8, height = 4)
Besides the obvious age disparities due to the cohort design adopted to collect the data for the TwinLife sample, we also see that our sample is “overeducated” in the sense that for example we have too many kids who were going to the Gymnasium (ST4: upper secondary) and too few adults who have something similar to an Ausbildungsabschluss (ISCED 3b: upper secondary, vocational). Migrants and females are also overrepresented in the sample.
You can use this interactive table to browse disparities at the level of joint distributions, ordered after magnitude. Note that this table has 5184 rows (number of subgroups/combinations of the 4 variables we use). Most of these subgroups are empty in the sample.
census %>%
full_join(tl %>%
group_by(age0100, male, educ, mig) %>%
summarise(sample_n = n()) %>%
rename(age = "age0100") %>%
mutate(sample_n = as.numeric(sample_n)), by = c("age", "male", "educ", "mig")) %>%
# set the subgroups which are empty in the sample to 0 instead of NA
mutate(sample_n = replace_na(sample_n, 0)) %>%
group_by(age, male, educ, mig) %>%
summarise(census_n = sum(census_n),
sample_n = sum(sample_n)) %>%
group_by(age) %>%
mutate(census_percentage = census_n*100/sum(census_n),
sample_percentage = sample_n*100/sum(sample_n),
dif_percentage = census_percentage-sample_percentage,
relative_dif = dif_percentage*census_n/100,
age = as.integer(age)) %>%
relocate(census_percentage, .after = "census_n") %>%
arrange(-relative_dif) %>% DT::datatable() %>%
DT::formatRound(c('dif_percentage', 'relative_dif', 'census_percentage', 'sample_percentage'), digits = 1)
brm_MAIN_nor_ints_no_educ_male <- readRDS("../unshareable_data/brms/cft/brm_MAIN_nor_ints_no_educ_male.rds", )
RPP_vs_raw <- age_norm_comparisons(
brm_MAIN_nor_ints_no_educ_male,
labels = c( "Raw", "RPP"),
prediction_transform = list(
function(x) round(pmax(0, pmin(56, x))) # for handling normal predictons (clip/censor/bound and round)
),
palette = c(
"#BC3C29FF",
"#0072B5FF"
),
output_file = "data/results/RPP_vs_raw.rds"
)
RPP_vs_raw[-1]
## $overall_estimates
## # A tibble: 2 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 37.4 NA 8.19 NA Raw
## 2 35.7 0.115 8.59 0.0989 RPP_brm_MAIN_nor_ints_no_educ_male
##
## $means_plot
##
## $SDs_plot
##
## $SEs_plot
##
## $percentile_plot
ggsave("figures/04_RPP_vs_raw_vs_RP.jpeg", RPP_vs_raw$means_plot, width = 8, height = 4)
ggsave("figures/05_RPP_vs_raw_vs_RP.jpeg", RPP_vs_raw$SDs_plot, width = 8, height = 4)
RPP_vs_raw$overall_estimates$Mean - mean(tl$cft)
## [1] 0.000 -1.735
RPP_vs_raw$means_ns_sds_and_ses %>% pivot_wider(names_from = source, names_sep = "_", values_from = -age) %>%
mutate(dif = mean_RPP - mean_Raw,
abs_dif = abs(mean_RPP - mean_Raw)) %>%
summarise(mean(dif), mean(abs_dif))
## # A tibble: 1 × 2
## `mean(dif)` `mean(abs_dif)`
## <dbl> <dbl>
## 1 -1.24 1.60
On average an absolute difference of ~ fifth of an SD between raw and corrected means.
Here we compare the RPP-based norms to the traditionally constructed norms reported in the CFT 20-R manual. We aggregate by the age groups as reported in the manual normal tables. Note that the manual means and SDs for ages 20 and older are based on the very same TwinLife sample we are using, they just use a different correction method that isn’t described in enough detail to make it reproducible for us.
sim_pop_sample_with_draws <- data.table::fread("data/results/sim_pop_sample_with_draws.csv.gz")
means_sds_and_ses_RPP <- sim_pop_sample_with_draws %>%
group_by(age_group, .draw) %>%
summarise(mean_prediction = mean(.prediction), sd_prediction = sd(.prediction)) %>%
summarise(RPP_mean = mean(mean_prediction),
RPP_se_of_mean = sd(mean_prediction),
RPP_sd = sqrt(mean(sd_prediction^2)),
RPP_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl <- tl %>%
filter(!is.na(age_group)) %>%
group_by(age_group) %>%
summarise(Raw_n = n(),
Raw_mean = mean(cft, na.rm = T),
Raw_sd = sd(cft, na.rm = T),
Raw_se_of_mean = Raw_sd/sqrt(Raw_n))
means_sds_and_ses <- means_sds_and_ses_RPP %>%
left_join(means_sds_and_ses_tl, by = "age_group") %>%
left_join(manual_norms, by = "age_group") %>%
filter(!is.na(Raw_n))
means_sds_and_ses_plot <- means_sds_and_ses %>%
pivot_longer(-age_group, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
# means and CIs
means_sds_and_ses_plot %>%
ggplot(aes(x = age_group, y = mean, group = source, colour = source)) +
scale_x_discrete(expand = expansion(add = c(0, 2))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 6,
linewidth = 1.5,
position = position_dodge(width = 0.3)) +
scale_colour_manual(values = c("#20854EFF", "#BC3C29FF", "#0072B5FF")) +
geom_text_repel(
data = means_sds_and_ses_plot %>% filter(age_group == "60-64"),
aes(label = c("RPP", "Raw", "")),
family = "Times", seed = 810, nudge_x = .5, hjust = 0) +
labs(x = "Age Group", y = "Mean CFT 20-R Score") +
geom_text_repel(
data = means_sds_and_ses_plot %>% filter(age_group == "14"),
aes(label = c("", "", "Manual, school sample")),
family = "Times", seed = 810, nudge_y = 2.5) +
geom_text_repel(
data = means_sds_and_ses_plot %>% filter(age_group == "40-44"),
aes(label = c("", "", "Manual, TwinLife sample")),
family = "Times", seed = 810, nudge_y = -2.5) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
geom_vline(xintercept = 7.5)
ggsave("figures/06_vs_Manual_means.jpeg", width = 8, height = 4)
means_sds_and_ses_plot %>%
ggplot(aes(x = age_group, y = sd, group = source, colour = source)) +
scale_x_discrete(expand = expansion(add = c(0, 2))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = sd - 1.96*se_of_sd, ymax = sd + 1.96*se_of_sd),
fatten = 6,
linewidth = 1.5,
position = position_dodge(width = 0.3)) +
scale_colour_manual(values = c("#20854EFF", "#BC3C29FF", "#0072B5FF")) +
geom_text_repel(
data = means_sds_and_ses_plot %>% filter(age_group == "60-64"),
aes(label = c("RPP", "Raw", "")),
family = "Times", seed = 810, nudge_x = .5, hjust = 0) +
labs(x = "Age Group", y = "Mean CFT 20-R Score") +
geom_text_repel(
data = means_sds_and_ses_plot %>% filter(age_group == "14"),
aes(label = c("", "", "Manual, school sample")),
family = "Times", seed = 810, nudge_y = -.5, nudge_x = -.5) +
geom_text_repel(
data = means_sds_and_ses_plot %>% filter(age_group == "40-44"),
aes(label = c("", "", "Manual, TwinLife sample")),
family = "Times", seed = 810, nudge_y = .5) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
geom_vline(xintercept = 7.5) +
labs(x = "Age Group", y = "CFT 20-R Score SD")
ggsave("figures/07_vs_Manual_SDs.jpeg", width = 8, height = 4)
Rather than calculating IQs for all possible CFT 20-R scores (0-56), we restrict our calculations to those which occur in our data.
iq <- function(cft_score, mean, sd) {
iq_score <- ((cft_score - mean) / sd) * 15 + 100
return(iq_score)
}
iqs_linear <- tl %>%
filter(!is.na(age_group)) %>%
select(age_group, raw_score = cft) %>%
left_join(means_sds_and_ses, by = "age_group") %>%
group_by(age_group) %>%
mutate(RPP_IQ_linear = iq(raw_score, RPP_mean, RPP_sd),
Manual_IQ_linear = iq(raw_score, Manual_mean, Manual_sd)) %>%
group_by(age_group, raw_score) %>%
summarise(RPP_IQ_linear = round(mean(RPP_IQ_linear)),
Manual_IQ_linear = round(mean(Manual_IQ_linear)),
IQ_linear_dif = RPP_IQ_linear - Manual_IQ_linear,
IQ_linear_dif_abs = abs(RPP_IQ_linear - Manual_IQ_linear))
iqs_normalised <- sim_pop_sample_with_draws %>%
filter(!is.na(age_group)) %>%
group_by(age_group, .draw) %>%
mutate(n = n(),
normal_transformed_score = qnorm((rank(.prediction) - 0.5) / n)) %>%
mutate(iq_score = normal_transformed_score * 15 + 100) %>%
group_by(age_group, raw_score = .prediction) %>%
summarise(RPP_IQ_normalised = round(mean(iq_score)),
RPP_IQ_normalised_se = sd(iq_score))
iq_table <- iqs_linear %>%
left_join(iqs_normalised, by = c("age_group", "raw_score")) %>%
mutate(IQ_RPP_dif = RPP_IQ_linear - RPP_IQ_normalised,
IQ_RPP_abs_dif = abs(RPP_IQ_linear - RPP_IQ_normalised),
Manual_kids_sample = ifelse(age_group %in% c("11", "12", "13", "14", "15", "16", "17-19"), TRUE, FALSE))
mean(iq_table$IQ_linear_dif_abs)
## [1] 4.218
mean(iq_table$IQ_linear_dif)
## [1] 1.943
max(iq_table$IQ_linear_dif_abs)
## [1] 19
mean(iq_table$IQ_RPP_abs_dif)
## [1] 0.9742
iq_table %>% group_by(Manual_kids_sample) %>% summarise(mean(IQ_linear_dif), mean(IQ_linear_dif_abs))
## # A tibble: 2 × 3
## Manual_kids_sample `mean(IQ_linear_dif)` `mean(IQ_linear_dif_abs)`
## <lgl> <dbl> <dbl>
## 1 FALSE -1.67 2.12
## 2 TRUE 7.38 7.38
t <- tibble("Manual Normalised IQ" = c(62, 92, 135, 70, 97, 134, 76, 100, 130))
iq_table %>%
select(age_group, raw_score, "RPP Normalised IQ" = RPP_IQ_normalised, "RPP Linear IQ" = RPP_IQ_linear, "Manual Linear IQ" = Manual_IQ_linear) %>%
filter((age_group == "16" & raw_score %in% c(22, 37, 53)) |
(age_group == "30-34" & raw_score %in% c(19, 37, 52)) |
(age_group == "60-64" & raw_score %in% c(12, 30, 46))) %>%
bind_cols(t) %>%
relocate("Manual Normalised IQ", .before = "RPP Linear IQ") %>%
kable()
age_group | raw_score | RPP Normalised IQ | Manual Normalised IQ | RPP Linear IQ | Manual Linear IQ |
---|---|---|---|---|---|
16 | 22 | 74 | 62 | 73 | 56 |
16 | 37 | 100 | 92 | 101 | 91 |
16 | 53 | 132 | 135 | 130 | 129 |
30-34 | 19 | 71 | 70 | 68 | 71 |
30-34 | 37 | 99 | 97 | 100 | 101 |
30-34 | 52 | 128 | 134 | 126 | 126 |
60-64 | 12 | 68 | 76 | 66 | 73 |
60-64 | 30 | 98 | 100 | 99 | 101 |
60-64 | 46 | 129 | 130 | 128 | 126 |
Following the vignette here.
library(cNORM)
## Norm sample
norm.data <- tl %>% select(age = age0100, educ, male, mig, cft) %>%
as.data.frame() %>% mutate(age = as.numeric(age))
## Generate population margins
load("data/preprocessed/de_census/census_margins.Rda")
# Both dfs have to be class `data.frame`, `tibble` is no-go.
margins <- as.data.frame(census_margins %>%
mutate(var = variable,
level = category,
prop = percentage/100) %>%
select(var, level, prop))
## Calculate raking weights
weights <- computeWeights(data = norm.data, population.margins = margins)
## Raking converged normally after 16 iterations.
## Norming model
norm.model <- cnorm(raw = norm.data$cft,
group = norm.data$age,
weights = weights,
scale = "IQ")
## Warning in rankByGroup(raw = data$raw, group = data$group, scale = scale, : The dataset includes cases, whose
## percentile depends on less than 30 cases (minimum is 13). Please check the distribution of the cases over the grouping
## variable. The confidence of the norm scores is low in that part of the scale. Consider redividing the cases over the
## grouping variable. In cases of disorganized percentile curves after modeling, it might help to reduce the 'k'
## parameter.
## Powers of location: k = 5
## Powers of age: t = 3
## Warning in computePowers(data, k = k, t = t, silent = silent):
## Multiple R2 between the explanatory variable and the raw score is low with R2 = 0.0442600897251998. Thus, there is not much variance that can be captured by the continuous norming procedure. The models are probably unstable. You can try to reduce the powers of A indepentently from k and/or to reduce the number of age groups. To model a simple linear age effect, this means to reduce the number of groups to 2 and to set t to 1.
## Final solution: 2 terms (highest consistent model)
## R-Square Adj. = 0.901607
## Final regression model: raw ~ L1 + A3
## Regression function: raw ~ -15.23303372 + (0.5280718343*L1) + (-0.00002099671743*A3)
## Raw Score RMSE = 2.85253
## Post stratification was applied. The weights range from 1.565 to 1402.747 (m = 63.838, sd = 84.52).
##
## Use 'printSubset(model)' to get detailed information on the different solutions, 'plotPercentiles(model) to display percentile plot, plotSubset(model)' to inspect model fit.
summary(norm.model)
## cNORM Model Summary
## -------------------
## Number of terms: 2
## Adjusted R-squared: 0.9016
## RMSE: 2.853
## Selection strategy: 1, largest consistent model
## Highest consistent model: 2
## Raw score variable: raw
## Raw score range: 7 to 55
## Age range: 11 to 65
##
## Regression function:
## raw ~ -15.233033721911 + (0.52807183425577*L1) + (-0.0000209967174266698*A3)
## Final solution: 2 terms (highest consistent model)
## R-Square Adj. = 0.901607
## Final regression model: raw ~ L1 + A3
## Regression function: raw ~ -15.23303372 + (0.5280718343*L1) + (-0.00002099671743*A3)
## Raw Score RMSE = 2.85253
## Post stratification was applied. The weights range from 1.565 to 1402.747 (m = 63.838, sd = 84.52).
# get means and sds for calculating iqs
means_sds_and_ses_RPP <- sim_pop_sample_with_draws %>%
group_by(age) %>%
summarise(RPP_mean = mean(.prediction),
RPP_sd = sd(.prediction))
iqs_norming_sample <- norm.data %>%
select(age, raw_score = cft) %>%
left_join(means_sds_and_ses_RPP, by = "age") %>%
group_by(age) %>%
mutate(RPP_IQ_linear = iq(raw_score, RPP_mean, RPP_sd),
cNorm_IQ = predictNorm(raw = raw_score,
A = age,
model = norm.model),
dif = RPP_IQ_linear - cNorm_IQ,
absolute_dif = abs(RPP_IQ_linear - cNorm_IQ)) %>%
ungroup()
# overall_comparison
iqs_norming_sample %>%
summarise(mean_RPP_IQ = mean(RPP_IQ_linear),
mean_cNorm_IQ = mean(cNorm_IQ),
mean_dif = mean(dif),
max_dif = max(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2))) %>% kable()
mean_RPP_IQ | mean_cNorm_IQ | mean_dif | max_dif | mean_absolute_dif | RMS_dif |
---|---|---|---|---|---|
102.8 | 102.4 | 0.4028 | 12.31 | 2.712 | 4.258 |
iqs_random_sample <- sim_pop_sample_with_draws %>%
group_by(.row) %>%
slice(1) %>%
select(age, raw_score = .prediction) %>%
left_join(means_sds_and_ses_RPP, by = "age") %>%
group_by(age) %>%
mutate(RPP_IQ_linear = iq(raw_score, RPP_mean, RPP_sd),
cNorm_IQ = predictNorm(raw = raw_score,
A = age,
model = norm.model),
dif = RPP_IQ_linear - cNorm_IQ,
absolute_dif = abs(RPP_IQ_linear - cNorm_IQ)) %>%
ungroup()
# overall_comparison
iqs_random_sample %>%
summarise(mean_RPP_IQ = mean(RPP_IQ_linear),
mean_cNorm_IQ = mean(cNorm_IQ),
mean_dif = mean(dif),
max_dif = max(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2))) %>% kable()
mean_RPP_IQ | mean_cNorm_IQ | mean_dif | max_dif | mean_absolute_dif | RMS_dif |
---|---|---|---|---|---|
99.79 | 99.76 | 0.026 | 12.29 | 2.159 | 3.148 |
means_ns_sds_and_ses <- iqs_norming_sample %>%
group_by(age) %>%
summarise(RPP_mean = mean(RPP_IQ_linear),
RPP_sd = sd(RPP_IQ_linear),
cNorm_mean = mean(cNorm_IQ),
cNorm_sd = sd(cNorm_IQ)) %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 5))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean , ymax = mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = source), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c(
"#E18727FF",
"#0072B5FF"
)) +
labs(x = "Age", y = "IQ")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = sd, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 5))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = sd , ymax = sd),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = source), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c(
"#E18727FF",
"#0072B5FF"
)) +
labs(x = "Age", y = "SD")
# Put them in a named list
df_list <- list(
"Norming sample" = iqs_norming_sample,
"Simulated poststratified sample" = iqs_random_sample
)
# Create a single table with all results
combined <- df_list %>%
imap_dfr(~ {
# .x is the data frame, .y is the name in the list (e.g. "norming_sample")
# 1) No tail filtering:
no_filter <- .x %>%
summarise(
mean_dif = mean(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2))
) %>%
pivot_longer(
cols = everything(),
names_to = "value_type",
values_to = "value"
) %>%
mutate(
at_tails = FALSE,
source = .y
)
# 2) With tail filtering:
with_filter <- .x %>%
filter(between(RPP_IQ_linear, 65, 75) | between(RPP_IQ_linear, 125, 135)) %>%
summarise(
mean_dif = mean(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2))
) %>%
pivot_longer(
cols = everything(),
names_to = "value_type",
values_to = "value"
) %>%
mutate(
at_tails = TRUE,
source = .y
)
# Combine the no-filter and filtered results:
bind_rows(no_filter, with_filter)
})
combined <- combined %>%
mutate(
value_type = factor(
value_type,
levels = c("mean_dif", "mean_absolute_dif", "RMS_dif"),
labels = c("Mean difference",
"Mean absolute difference",
"Root mean squared difference")
)
)
# Create the plot
ggplot(combined, aes(x = source, y = value, fill = at_tails)) +
geom_col(position = position_dodge()) +
# Facet in the order set in the factor above
facet_wrap(~ value_type, scales = "free_y") +
labs(
x = "Source",
y = "Value"
) +
# Use manual color scale with the requested colors
scale_fill_manual(
name = "At tails",
values = c("FALSE" = "#24ff24", "TRUE" = "#490092")
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold") # emphasize facet labels
) + scale_y_continuous(limits = c(-1, 6))
cnorm_percentiles <- (plotPercentiles(norm.model, percentiles = c(0.01, 0.05, 0.5, 0.95, 0.99)))$data %>%
filter(complete.cases(value) & type == "Predicted") %>%
rename(age = group,
Raw_score = value) %>%
mutate(Source = "cNorm + R",
age = round(age),
Raw_score = round(Raw_score),
percentile_value = case_when(
percentile == "PR1" ~ 0.01,
percentile == "PR5" ~ 0.05,
percentile == "PR50" ~ 0.50,
percentile == "PR95" ~ 0.95,
percentile == "PR99" ~ 0.99,
TRUE ~ NA_real_ # Handle any unexpected values
),
percentile_label = case_when(
percentile == "PR1" ~ "1st percentile",
percentile == "PR5" ~ "5th percentile",
percentile == "PR50" ~ "50th percentile",
percentile == "PR95" ~ "95th percentile",
percentile == "PR99" ~ "99th percentile",
TRUE ~ NA_character_ # Handle any unexpected values
)
) %>%
select(-type, -percentile)
RPP_percentiles <- RPP_vs_raw$percentile_plot$data
combined_percentiles <- RPP_percentiles %>% bind_rows(cnorm_percentiles)
color_values <- c("RPP" = "#0072B5FF", "cNorm + R" = "#E18727FF")
combined_percentiles$Source <- factor(combined_percentiles$Source, levels = c("RPP", "cNorm + R"))
baseMaxLabelLen <- max(nchar(combined_percentiles$line_label), na.rm = TRUE)
myExpandRight2 <- baseMaxLabelLen / 1.7
max_age_val <- max(combined_percentiles$age, na.rm = TRUE)
ggplot(combined_percentiles, aes(x = age, y = Raw_score,
color = Source,
group = interaction(Source, percentile_label))) +
geom_step(linewidth = 1, linetype = 1) +
scale_x_continuous(
breaks = seq(min(combined_percentiles$age), max_age_val, by = 2),
expand = expansion(add = c(1, myExpandRight2))
) +
scale_color_manual(values = color_values, name = "") +
geom_text(
data = combined_percentiles %>% filter(age == max_age_val, show_text, !is.na(line_label)),
aes(label = line_label),
nudge_x = 1,
hjust = 0,
family = "Times"
) +
labs(x = "Age", y = "Test score", color = "") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", linewidth = 0.3),
legend.position = "top"
)
ggsave("figures/08_RPP_vs_cNorm.jpeg", width = 8, height = 4)
Most of the code in this section is from de Vries et al.’s (2025) supplementary materials.
Assuming no interactions and homoscedasticity, like in de Vries et al. (2025).
source("BB_free_order_functions.R")
tl_select <- select(tl, cft, age, educ, mig, male) %>% mutate_at(vars(-cft, -age), ~as.factor(.))
fbselectBB(mydata = tl_select,
score = tl_select$cft,
age = tl_select$age,
max_score=56,
re_structure = "+random(educ)+random(mig)+random(male)")
##
## Family: c("BB", "Beta Binomial")
## Fitting method: RS(1000)
##
## Call: gamlss::gamlss(formula = Score ~ poly(X, 5) + random(educ) +
## random(mig) + random(male), sigma.formula = ~1, family = "BB", data = mydata, method = RS(1000),
## pb = max_score, trace = FALSE)
##
## Mu Coefficients:
## (Intercept) poly(X, 5)1 poly(X, 5)2 poly(X, 5)3 poly(X, 5)4 poly(X, 5)5 random(educ) random(mig)
## 0.716 2.318 -16.190 7.415 -7.483 3.562 NA NA
## random(male)
## NA
## Sigma Coefficients:
## (Intercept)
## -2.93
##
## Degrees of Freedom for the fit: 29.7 Residual Deg. of Freedom 9950
## Global Deviance: 66025
## AIC: 66085
## SBC: 66299
# Fit the selected model
max_score = 56 # maximum test score
tl_select$Score <- cbind(tl_select$cft, max_score - tl_select$cft) # reformat the scores to suit the BB distribution
GAMLSS_MRP_model <-gamlss(Score ~ poly(age, 5) + random(educ) + random(mig) + random(male), sigma.formula = ~1, family = "BB", data = tl_select, method = RS(1000), pb = max_score)
## GAMLSS-RS iteration 1: Global Deviance = 70932
## GAMLSS-RS iteration 2: Global Deviance = 66077
## GAMLSS-RS iteration 3: Global Deviance = 66025
## GAMLSS-RS iteration 4: Global Deviance = 66025
## GAMLSS-RS iteration 5: Global Deviance = 66025
## GAMLSS-RS iteration 6: Global Deviance = 66025
GAMLSS_MRP_model_spline_sigma <- gamlss(
formula = Score ~ random(male) +
random(mig) +
random(mig:male) +
random(mig:educ) +
pb(age, by = educ),
sigma.formula = ~
random(male) +
random(mig) +
random(educ) +
pb(age),
family = "BB",
method = RS(1000),
data = tl_select
)
## GAMLSS-RS iteration 1: Global Deviance = 70377
## GAMLSS-RS iteration 2: Global Deviance = 65954
## GAMLSS-RS iteration 3: Global Deviance = 65789
## GAMLSS-RS iteration 4: Global Deviance = 65788
## GAMLSS-RS iteration 5: Global Deviance = 65788
## GAMLSS-RS iteration 6: Global Deviance = 65789
## GAMLSS-RS iteration 7: Global Deviance = 65789
## GAMLSS-RS iteration 8: Global Deviance = 65789
## GAMLSS-RS iteration 9: Global Deviance = 65789
## GAMLSS-RS iteration 10: Global Deviance = 65789
## GAMLSS-RS iteration 11: Global Deviance = 65789
## GAMLSS-RS iteration 12: Global Deviance = 65789
## GAMLSS-RS iteration 13: Global Deviance = 65790
## GAMLSS-RS iteration 14: Global Deviance = 65790
## GAMLSS-RS iteration 15: Global Deviance = 65790
## GAMLSS-RS iteration 16: Global Deviance = 65790
## GAMLSS-RS iteration 17: Global Deviance = 65790
## GAMLSS-RS iteration 18: Global Deviance = 65790
## GAMLSS-RS iteration 19: Global Deviance = 65790
## GAMLSS-RS iteration 20: Global Deviance = 65790
## GAMLSS-RS iteration 21: Global Deviance = 65790
## GAMLSS-RS iteration 22: Global Deviance = 65790
## GAMLSS-RS iteration 23: Global Deviance = 65790
## GAMLSS-RS iteration 24: Global Deviance = 65790
term.plot(
object = GAMLSS_MRP_model_spline_sigma, # your fitted gamlss() model
what = "mu", # or "sigma" if you want the sigma part
# partial = TRUE, # plot partial effects
# rug = TRUE, # show rug of observed data
ask = FALSE, # to avoid prompting between plots
data = tl_select # the data set
)
BIC(GAMLSS_MRP_model_spline_sigma, GAMLSS_MRP_model)
## df BIC
## GAMLSS_MRP_model_spline_sigma 110.5 66807
## GAMLSS_MRP_model 29.7 66299
AIC(GAMLSS_MRP_model_spline_sigma, GAMLSS_MRP_model)
## df AIC
## GAMLSS_MRP_model_spline_sigma 110.5 66011
## GAMLSS_MRP_model 29.7 66085
Conflicting conclusions from BIC and AIC.
ages <- seq(11,65,1) # ages for which to predict the test score distribution
pop_age <- expand.grid(age = ages, educ = levels(tl_select$educ), mig = levels(tl_select$mig), male = levels(tl_select$male)) # dataframe with all ages
est.values.mrp <- predictAll(GAMLSS_MRP_model, newdata = pop_age, type = "response") # Predict distributional parameters all age values values and all adjustment variables
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
population_data <- census %>% group_by(male, educ, mig) %>%
summarise(census_n = sum(census_n)) %>%
ungroup() %>%
mutate(Proportion = census_n/sum(census_n)) %>%
mutate(male = as.factor(male)) %>%
select(-census_n)
pop_age <- left_join(pop_age, population_data, by = c("educ", "mig", "male")) # Append the predictions with population percentages of the adjustment variables from population data
Min_score <- 0 # Minimum (adjusted) raw test score for which to calculate percentiles
Max_score <- 56 # Maximum (adjusted) raw test score for which to calculate percentiles
step_size <- 1
CDF_matrix_mrp <- matrix(NA, ncol = (Max_score - Min_score)/step_size + 1, nrow = nrow(pop_age)) # Create empty matrix to store the percentiles; columns = raw test scores & rows = age value & adjustment variable combinations
for (i in 1:nrow(pop_age)){ # for each score, age and adjustment variable combination, estimate the corresponding percentile. Multiply by population proportion of adjustment variable (will marginalize out later)
CDF_matrix_mrp[i,] <- pBB(seq(from = Min_score, to = Max_score, by = step_size), mu = est.values.mrp$mu[i], sigma = est.values.mrp$sigma[i], bd = max_score)*pop_age$Proportion[i] # lower.tail = FALSE when a higher test score must correspond to a lower rather than higher percentile.
}
CDF_matrix_mrp <- aggregate(CDF_matrix_mrp, list(age = pop_age$age), sum) # Marginalize out the adjustment variables: sum over the previous weighted percentiles to obtain the MRP estimate for the CDF.
for (i in (2:58)){ # Change column names to correspond to the test scores, not 'V1', 'V2' (V1 corresponded to the minimum score of 0, V32 to the maximum score of 31.)
names(CDF_matrix_mrp)[i] <- i-2
}
# Continuity correction, because the discrete percentiles of the BB distribution are converted to a continuous scale (Z-scores)
CDF_matrix_mrp_cc <- CDF_matrix_mrp # define a new matrix CDF_matrix_mrp_cc containing continuity corrected percentiles.
CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc)] <- 0.5*CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc)]+0.5*cbind(rep(0,nrow(CDF_matrix_mrp_cc)),CDF_matrix_mrp_cc[,2:(ncol(CDF_matrix_mrp_cc)-1)])
# Trim outer percentiles to avoid errors in conversion to Z-scores (i.e., NaNs in qnorm).
CDF_matrix_mrp[,-1][which(CDF_matrix_mrp[,-1] < 0.0001, arr.ind = TRUE)] <- 0.0001
CDF_matrix_mrp[,-1][which(CDF_matrix_mrp[,-1] > 0.9999, arr.ind = TRUE)] <- 0.9999
# Transform estimated percentiles to estimated normalized z-scores
z_matrix <- qnorm(as.matrix(CDF_matrix_mrp[,-1])) # [,-1]: remove the first column: Age.
# Restrict the normalized z-scores to the range [-3; +3]
z_matrix <- apply(z_matrix, 1, function(x) {ifelse(x > 3, 3, x)})
z_matrix <- apply(z_matrix, 1, function(x) {ifelse(x < -3, -3, x)})
z_matrix <- cbind(CDF_matrix_mrp[1],z_matrix) # add age again
pop_age <- expand.grid(age = ages, educ = levels(tl_select$educ), mig = levels(tl_select$mig), male = levels(tl_select$male)) # dataframe with all ages
est.values.mrp2 <- predictAll(GAMLSS_MRP_model_spline_sigma, newdata = pop_age, type = "response")
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
# the model fails to predict a mu for some subcategories, so we remove them
valid <- !is.na(est.values.mrp2$mu)
pop_age <- pop_age[valid, ]
est.values.mrp2 <- lapply(est.values.mrp2, function(x) x[valid])
pop_age <- left_join(pop_age, population_data, by = c("educ", "mig", "male")) # Append the predictions with population percentages of the adjustment variables from population data
CDF_matrix_mrp2 <- matrix(NA, ncol = (Max_score - Min_score)/step_size + 1, nrow = nrow(pop_age)) # Create empty matrix to store the percentiles; columns = raw test scores & rows = age value & adjustment variable combinations
for (i in 1:nrow(pop_age)){ # for each score, age and adjustment variable combination, estimate the corresponding percentile. Multiply by population proportion of adjustment variable (will marginalize out later)
CDF_matrix_mrp2[i,] <- pBB(seq(from = Min_score, to = Max_score, by = step_size), mu = est.values.mrp2$mu[i], sigma = est.values.mrp2$sigma[i], bd = max_score)*pop_age$Proportion[i] # lower.tail = FALSE when a higher test score must correspond to a lower rather than higher percentile.
}
CDF_matrix_mrp2 <- aggregate(CDF_matrix_mrp2, list(age = pop_age$age), sum) # Marginalize out the adjustment variables: sum over the previous weighted percentiles to obtain the MRP estimate for the CDF.
for (i in (2:58)){ # Change column names to correspond to the test scores, not 'V1', 'V2' (V1 corresponded to the minimum score of 0, V32 to the maximum score of 31.)
names(CDF_matrix_mrp2)[i] <- i-2
}
# Continuity correction, because the discrete percentiles of the BB distribution are converted to a continuous scale (Z-scores)
CDF_matrix_mrp2_cc <- CDF_matrix_mrp2 # define a new matrix CDF_matrix_mrp2_cc containing continuity corrected percentiles.
CDF_matrix_mrp2_cc[,2:ncol(CDF_matrix_mrp2_cc)] <- 0.5*CDF_matrix_mrp2_cc[,2:ncol(CDF_matrix_mrp2_cc)]+0.5*cbind(rep(0,nrow(CDF_matrix_mrp2_cc)),CDF_matrix_mrp2_cc[,2:(ncol(CDF_matrix_mrp2_cc)-1)])
# Trim outer percentiles to avoid errors in conversion to Z-scores (i.e., NaNs in qnorm).
CDF_matrix_mrp2[,-1][which(CDF_matrix_mrp2[,-1] < 0.0001, arr.ind = TRUE)] <- 0.0001
CDF_matrix_mrp2[,-1][which(CDF_matrix_mrp2[,-1] > 0.9999, arr.ind = TRUE)] <- 0.9999
# Transform estimated percentiles to estimated normalized z-scores
z_matrix2 <- qnorm(as.matrix(CDF_matrix_mrp2[,-1])) # [,-1]: remove the first column: Age.
# Restrict the normalized z-scores to the range [-3; +3]
z_matrix2 <- apply(z_matrix2, 1, function(x) {ifelse(x > 3, 3, x)})
z_matrix2 <- apply(z_matrix2, 1, function(x) {ifelse(x < -3, -3, x)})
z_matrix2 <- cbind(CDF_matrix_mrp2[1],z_matrix2) # add age again
cdf_df <- as.data.frame(CDF_matrix_mrp)
names(cdf_df)[-1] <- as.integer(names(cdf_df)[-1]) # convert "0", "1", ..., "56" to numeric
cdf_df2 <- as.data.frame(CDF_matrix_mrp2)
names(cdf_df2)[-1] <- as.integer(names(cdf_df2)[-1]) # convert "0", "1", ..., "56" to numeric
cdf_long <- cdf_df %>%
pivot_longer(-age, names_to = "Raw_score", values_to = "percentile") %>%
mutate(Raw_score = as.numeric(Raw_score))
cdf_long2 <- cdf_df2 %>%
pivot_longer(-age, names_to = "Raw_score", values_to = "percentile") %>%
mutate(Raw_score = as.numeric(Raw_score))
# Interpolate raw scores at e6ach target percentile per age
target_p <- c(0.01, 0.05, 0.5, 0.95, 0.99)
GAMLSS_MRP_percentiles <- cdf_long %>%
group_by(age) %>%
arrange(percentile, .by_group = TRUE) %>%
reframe(
p = target_p,
Raw_score = round(approx(x = percentile, y = Raw_score, xout = target_p, ties = "ordered")$y),
.groups = "drop"
) %>%
mutate(
Source = "Simple GAMLSS + MRP",
percentile_label = case_when(
abs(p - 0.01) < 1e-8 ~ "1st percentile",
abs(p - 0.05) < 1e-8 ~ "5th percentile",
abs(p - 0.5) < 1e-8 ~ "50th percentile",
abs(p - 0.95) < 1e-8 ~ "95th percentile",
abs(p - 0.99) < 1e-8 ~ "99th percentile"
),
show_text = FALSE,
line_label = percentile_label
) %>%
select(age, Raw_score, Source, percentile_label, show_text, line_label)
GAMLSS_MRP_percentiles2 <- cdf_long2 %>%
group_by(age) %>%
arrange(percentile, .by_group = TRUE) %>%
reframe(
p = target_p,
Raw_score = round(approx(x = percentile, y = Raw_score, xout = target_p, ties = "ordered")$y),
.groups = "drop"
) %>%
mutate(
Source = "Complex GAMLSS + MRP",
percentile_label = case_when(
abs(p - 0.01) < 1e-8 ~ "1st percentile",
abs(p - 0.05) < 1e-8 ~ "5th percentile",
abs(p - 0.5) < 1e-8 ~ "50th percentile",
abs(p - 0.95) < 1e-8 ~ "95th percentile",
abs(p - 0.99) < 1e-8 ~ "99th percentile"
),
show_text = FALSE,
line_label = percentile_label
) %>%
select(age, Raw_score, Source, percentile_label, show_text, line_label)
combined_percentiles <- RPP_percentiles %>% bind_rows(GAMLSS_MRP_percentiles) %>% bind_rows(GAMLSS_MRP_percentiles2)
combined_percentiles$Source <- factor(combined_percentiles$Source, levels = c("RPP", "Simple GAMLSS + MRP", "Complex GAMLSS + MRP"))
color_values <- c("RPP" = "#0072B5FF", "Simple GAMLSS + MRP" = "#E18727FF", "Complex GAMLSS + MRP" = "#FFDC91FF")
ggplot(combined_percentiles, aes(x = age, y = Raw_score,
color = Source,
group = interaction(Source, percentile_label))) +
geom_step(linewidth = 1, linetype = 1) +
scale_x_continuous(
breaks = seq(min(combined_percentiles$age), 65, by = 2),
expand = expansion(add = c(1, 8))
) +
scale_color_manual(values = color_values, name = "") +
geom_text(
data = combined_percentiles %>% filter(age == 65, show_text, !is.na(line_label)),
aes(label = line_label),
nudge_x = 1,
hjust = 0,
family = "Times"
) +
labs(x = "Age", y = "Test score", color = "") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", linewidth = 0.3),
legend.position = "top"
)
ggsave("figures/09_RPP_vs_GAMLSS_MRP.jpeg", width = 8, height = 4)
z_df <- as.data.frame(z_matrix)
names(z_df)[-1] <- as.integer(names(z_df)[-1])
z_df2 <- as.data.frame(z_matrix2)
names(z_df2)[-1] <- as.integer(names(z_df2)[-1])
z_long <- z_df %>%
pivot_longer(-age, names_to = "raw_score", values_to = "z") %>%
mutate(
raw_score = as.integer(raw_score),
GAMLSS_MRP_IQ = round(100 + 15 * z, 1) # Convert Z to IQ
) %>%
select(age, raw_score, GAMLSS_MRP_IQ)
z_long2 <- z_df2 %>%
pivot_longer(-age, names_to = "raw_score", values_to = "z") %>%
mutate(
raw_score = as.integer(raw_score),
GAMLSS_s_MRP_IQ = round(100 + 15 * z, 1) # Convert Z to IQ
) %>%
select(age, raw_score, GAMLSS_s_MRP_IQ)
iqs_norming_sample <- tl %>%
select(age = age0100, raw_score = cft) %>%
left_join(means_sds_and_ses_RPP, by = "age") %>%
left_join(z_long, by = c("age", "raw_score")) %>%
left_join(z_long2, by = c("age", "raw_score")) %>%
group_by(age) %>%
mutate(
RPP_IQ_linear = iq(raw_score, RPP_mean, RPP_sd),
dif = RPP_IQ_linear - GAMLSS_MRP_IQ,
absolute_dif = abs(dif)
) %>% ungroup()
# overall_comparison to BB selected model
iqs_norming_sample %>%
summarise(mean_RPP_IQ = mean(RPP_IQ_linear),
mean_GAMLSS_IQ = mean(GAMLSS_MRP_IQ),
mean_dif = mean(dif),
max_dif = max(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2))) %>% kable()
mean_RPP_IQ | mean_GAMLSS_IQ | mean_dif | max_dif | mean_absolute_dif | RMS_dif |
---|---|---|---|---|---|
102.8 | 104.6 | -1.826 | 1.411 | 2.078 | 3.148 |
means_ns_sds_and_ses <- iqs_norming_sample %>%
group_by(age) %>%
summarise(RPP_mean = mean(RPP_IQ_linear),
RPP_sd = sd(RPP_IQ_linear),
"Simple GAMLSS + MRP_mean" = mean(GAMLSS_MRP_IQ),
"Complex GAMLSS + MRP_mean" = mean(GAMLSS_s_MRP_IQ)) %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 17))) +
scale_color_manual(values = color_values, name = "") +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean , ymax = mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = source), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
)+
labs(x = "Age", y = "IQ")
Spoiler: yes.
ages <- seq(11,65,1) # ages for which to predict the test score distribution
pop_age <- expand.grid(age = ages, educ = levels(tl_select$educ), mig = levels(tl_select$mig), male = levels(tl_select$male)) # dataframe with all ages
est.values.mrp <- predictAll(GAMLSS_MRP_model_spline_sigma, newdata = pop_age, type = "response") # Predict distributional parameters all age values values and all adjustment variables
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
valid <- !is.na(est.values.mrp$mu)
pop_age <- pop_age[valid, ]
est.values.mrp <- lapply(est.values.mrp, function(x) x[valid])
# Include age in the population data grouping
population_data <- census %>% group_by(age, male, educ, mig) %>%
summarise(census_n = sum(census_n)) %>%
ungroup() %>%
group_by(age) %>%
mutate(Proportion = census_n/sum(census_n)) %>%
ungroup() %>%
mutate(male = as.factor(male)) %>%
select(-census_n)
pop_age <- left_join(pop_age, population_data, by = c("age", "educ", "mig", "male")) %>%
mutate(Proportion = if_else(is.na(Proportion), 0, Proportion)) # Append the predictions with population percentages
Min_score <- 0 # Minimum (adjusted) raw test score for which to calculate percentiles
Max_score <- 56 # Maximum (adjusted) raw test score for which to calculate percentiles
step_size <- 1
CDF_matrix_mrp <- matrix(NA, ncol = (Max_score - Min_score)/step_size + 1, nrow = nrow(pop_age))
for (i in 1:nrow(pop_age)){
CDF_matrix_mrp[i,] <- pBB(seq(from = Min_score, to = Max_score, by = step_size), mu = est.values.mrp$mu[i], sigma = est.values.mrp$sigma[i], bd = max_score)*pop_age$Proportion[i]
}
# Since we already have age-specific population weights, we can directly aggregate by age
CDF_matrix_mrp <- aggregate(CDF_matrix_mrp, list(age = pop_age$age), sum)
for (i in (2:58)){
names(CDF_matrix_mrp)[i] <- i-2
}
# Continuity correction
CDF_matrix_mrp_cc <- CDF_matrix_mrp
CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc)] <- 0.5*CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc)]+0.5*cbind(rep(0,nrow(CDF_matrix_mrp_cc)),CDF_matrix_mrp_cc[,2:(ncol(CDF_matrix_mrp_cc)-1)])
# Trim outer percentiles
CDF_matrix_mrp[,-1][which(CDF_matrix_mrp[,-1] < 0.0001, arr.ind = TRUE)] <- 0.0001
CDF_matrix_mrp[,-1][which(CDF_matrix_mrp[,-1] > 0.9999, arr.ind = TRUE)] <- 0.9999
# Transform estimated percentiles to estimated normalized z-scores
z_matrix <- qnorm(as.matrix(CDF_matrix_mrp[,-1]))
# Restrict the normalized z-scores to the range [-3; +3]
z_matrix <- apply(z_matrix, 1, function(x) {ifelse(x > 3, 3, x)})
z_matrix <- apply(z_matrix, 1, function(x) {ifelse(x < -3, -3, x)})
z_matrix <- cbind(CDF_matrix_mrp[1],z_matrix) # add age again
pop_age <- expand.grid(age = ages, educ = levels(tl_select$educ), mig = levels(tl_select$mig), male = levels(tl_select$male)) # dataframe with all ages
est.values.mrp2 <- predictAll(GAMLSS_MRP_model_spline_sigma, newdata = pop_age, type = "response")
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
population_data <- census %>% group_by(male, educ, mig) %>%
summarise(census_n = sum(census_n)) %>%
ungroup() %>%
mutate(Proportion = census_n/sum(census_n)) %>%
mutate(male = as.factor(male)) %>%
select(-census_n)
# the model fails to predict a mu for some subcategories, so we remove them
valid <- !is.na(est.values.mrp2$mu)
pop_age <- pop_age[valid, ]
est.values.mrp2 <- lapply(est.values.mrp2, function(x) x[valid])
pop_age <- left_join(pop_age, population_data, by = c("educ", "mig", "male")) # Append the predictions with population percentages of the adjustment variables from population data
CDF_matrix_mrp2 <- matrix(NA, ncol = (Max_score - Min_score)/step_size + 1, nrow = nrow(pop_age)) # Create empty matrix to store the percentiles; columns = raw test scores & rows = age value & adjustment variable combinations
for (i in 1:nrow(pop_age)){ # for each score, age and adjustment variable combination, estimate the corresponding percentile. Multiply by population proportion of adjustment variable (will marginalize out later)
CDF_matrix_mrp2[i,] <- pBB(seq(from = Min_score, to = Max_score, by = step_size), mu = est.values.mrp2$mu[i], sigma = est.values.mrp2$sigma[i], bd = max_score)*pop_age$Proportion[i] # lower.tail = FALSE when a higher test score must correspond to a lower rather than higher percentile.
}
CDF_matrix_mrp2 <- aggregate(CDF_matrix_mrp2, list(age = pop_age$age), sum) # Marginalize out the adjustment variables: sum over the previous weighted percentiles to obtain the MRP estimate for the CDF.
for (i in (2:58)){ # Change column names to correspond to the test scores, not 'V1', 'V2' (V1 corresponded to the minimum score of 0, V32 to the maximum score of 31.)
names(CDF_matrix_mrp2)[i] <- i-2
}
# Continuity correction, because the discrete percentiles of the BB distribution are converted to a continuous scale (Z-scores)
CDF_matrix_mrp2_cc <- CDF_matrix_mrp2 # define a new matrix CDF_matrix_mrp2_cc containing continuity corrected percentiles.
CDF_matrix_mrp2_cc[,2:ncol(CDF_matrix_mrp2_cc)] <- 0.5*CDF_matrix_mrp2_cc[,2:ncol(CDF_matrix_mrp2_cc)]+0.5*cbind(rep(0,nrow(CDF_matrix_mrp2_cc)),CDF_matrix_mrp2_cc[,2:(ncol(CDF_matrix_mrp2_cc)-1)])
# Trim outer percentiles to avoid errors in conversion to Z-scores (i.e., NaNs in qnorm).
CDF_matrix_mrp2[,-1][which(CDF_matrix_mrp2[,-1] < 0.0001, arr.ind = TRUE)] <- 0.0001
CDF_matrix_mrp2[,-1][which(CDF_matrix_mrp2[,-1] > 0.9999, arr.ind = TRUE)] <- 0.9999
# Transform estimated percentiles to estimated normalized z-scores
z_matrix2 <- qnorm(as.matrix(CDF_matrix_mrp2[,-1])) # [,-1]: remove the first column: Age.
# Restrict the normalized z-scores to the range [-3; +3]
z_matrix2 <- apply(z_matrix2, 1, function(x) {ifelse(x > 3, 3, x)})
z_matrix2 <- apply(z_matrix2, 1, function(x) {ifelse(x < -3, -3, x)})
z_matrix2 <- cbind(CDF_matrix_mrp2[1],z_matrix2) # add age again
cdf_df <- as.data.frame(CDF_matrix_mrp)
names(cdf_df)[-1] <- as.integer(names(cdf_df)[-1]) # convert "0", "1", ..., "56" to numeric
cdf_df2 <- as.data.frame(CDF_matrix_mrp2)
names(cdf_df2)[-1] <- as.integer(names(cdf_df2)[-1]) # convert "0", "1", ..., "56" to numeric
cdf_long <- cdf_df %>%
pivot_longer(-age, names_to = "Raw_score", values_to = "percentile") %>%
mutate(Raw_score = as.numeric(Raw_score))
cdf_long2 <- cdf_df2 %>%
pivot_longer(-age, names_to = "Raw_score", values_to = "percentile") %>%
mutate(Raw_score = as.numeric(Raw_score))
# Interpolate raw scores at e6ach target percentile per age
target_p <- c(0.01, 0.05, 0.5, 0.95, 0.99)
GAMLSS_MRP_percentiles <- cdf_long %>%
group_by(age) %>%
arrange(percentile, .by_group = TRUE) %>%
reframe(
p = target_p,
Raw_score = round(approx(x = percentile, y = Raw_score, xout = target_p, ties = "ordered")$y),
.groups = "drop"
) %>%
mutate(
Source = "PS with age",
percentile_label = case_when(
abs(p - 0.01) < 1e-8 ~ "1st percentile",
abs(p - 0.05) < 1e-8 ~ "5th percentile",
abs(p - 0.5) < 1e-8 ~ "50th percentile",
abs(p - 0.95) < 1e-8 ~ "95th percentile",
abs(p - 0.99) < 1e-8 ~ "99th percentile"
),
show_text = FALSE,
line_label = percentile_label
) %>%
select(age, Raw_score, Source, percentile_label, show_text, line_label)
GAMLSS_MRP_percentiles2 <- cdf_long2 %>%
group_by(age) %>%
arrange(percentile, .by_group = TRUE) %>%
reframe(
p = target_p,
Raw_score = round(approx(x = percentile, y = Raw_score, xout = target_p, ties = "ordered")$y),
.groups = "drop"
) %>%
mutate(
Source = "PS without age",
percentile_label = case_when(
abs(p - 0.01) < 1e-8 ~ "1st percentile",
abs(p - 0.05) < 1e-8 ~ "5th percentile",
abs(p - 0.5) < 1e-8 ~ "50th percentile",
abs(p - 0.95) < 1e-8 ~ "95th percentile",
abs(p - 0.99) < 1e-8 ~ "99th percentile"
),
show_text = FALSE,
line_label = percentile_label
) %>%
select(age, Raw_score, Source, percentile_label, show_text, line_label)
combined_percentiles <- RPP_percentiles %>% bind_rows(GAMLSS_MRP_percentiles) %>% bind_rows(GAMLSS_MRP_percentiles2)
combined_percentiles$Source <- factor(combined_percentiles$Source, levels = c("RPP", "PS with age", "PS without age"))
color_values <- c("RPP" = "#0072B5FF", "PS with age" = "#E18727FF", "PS without age" = "#FFDC91FF")
ggplot(combined_percentiles, aes(x = age, y = Raw_score,
color = Source,
group = interaction(Source, percentile_label))) +
geom_step(linewidth = 1, linetype = 1) +
scale_x_continuous(
breaks = seq(min(combined_percentiles$age), 65, by = 2),
expand = expansion(add = c(1, 8))
) +
scale_color_manual(values = color_values, name = "") +
geom_text(
data = combined_percentiles %>% filter(age == 65, show_text, !is.na(line_label)),
aes(label = line_label),
nudge_x = 1,
hjust = 0,
family = "Times"
) +
labs(x = "Age", y = "Test score", color = "") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", linewidth = 0.3),
legend.position = "top"
)
ggsave("figures/S08_RPP_vs_GAMLSS_MRP_PS.jpeg", width = 8, height = 4)
z_df <- as.data.frame(z_matrix)
names(z_df)[-1] <- as.integer(names(z_df)[-1])
z_df2 <- as.data.frame(z_matrix2)
names(z_df2)[-1] <- as.integer(names(z_df2)[-1])
z_long <- z_df %>%
pivot_longer(-age, names_to = "raw_score", values_to = "z") %>%
mutate(
raw_score = as.integer(raw_score),
GAMLSS_MRP_IQ = round(100 + 15 * z, 1) # Convert Z to IQ
) %>%
select(age, raw_score, GAMLSS_MRP_IQ)
z_long2 <- z_df2 %>%
pivot_longer(-age, names_to = "raw_score", values_to = "z") %>%
mutate(
raw_score = as.integer(raw_score),
GAMLSS_s_MRP_IQ = round(100 + 15 * z, 1) # Convert Z to IQ
) %>%
select(age, raw_score, GAMLSS_s_MRP_IQ)
iqs_norming_sample <- tl %>%
select(age = age0100, raw_score = cft) %>%
left_join(means_sds_and_ses_RPP, by = "age") %>%
left_join(z_long, by = c("age", "raw_score")) %>%
left_join(z_long2, by = c("age", "raw_score")) %>%
group_by(age) %>%
mutate(
RPP_IQ_linear = iq(raw_score, RPP_mean, RPP_sd),
dif = RPP_IQ_linear - GAMLSS_MRP_IQ,
absolute_dif = abs(dif)
) %>% ungroup()
# overall_comparison to BB selected model
iqs_norming_sample %>%
summarise(mean_RPP_IQ = mean(RPP_IQ_linear),
mean_GAMLSS_IQ = mean(GAMLSS_MRP_IQ),
mean_dif = mean(dif),
max_dif = max(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2)))
## # A tibble: 1 × 6
## mean_RPP_IQ mean_GAMLSS_IQ mean_dif max_dif mean_absolute_dif RMS_dif
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 103. 104. -1.17 1.54 1.44 2.37
means_ns_sds_and_ses <- iqs_norming_sample %>%
group_by(age) %>%
summarise(RPP_mean = mean(RPP_IQ_linear),
RPP_sd = sd(RPP_IQ_linear),
"PS with age_mean" = mean(GAMLSS_MRP_IQ),
"PS without age_mean" = mean(GAMLSS_s_MRP_IQ)) %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 1))) +
scale_color_manual(values = color_values, name = "") +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean , ymax = mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 43 & source == "RPP"),
aes(label = source), family = "Times", seed = 810, nudge_y = -1, point.padding = 0, direction = "y") +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 30 & source == "PS with age"),
aes(label = source), family = "Times", seed = 810, nudge_x = 9, point.padding = 1, direction = "y") +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 29 & source == "PS without age"),
aes(label = source), family = "Times", seed = 810, nudge_y = 5, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
)+
labs(x = "Age", y = "IQ")
Spoiler: no
GAMLSS_MRP_model_spline_sigma2 <- gamlss(
formula = Score ~ random(male) +
random(mig) +
random(mig:male) +
random(mig:educ) +
random(educ) + # also tried educ as fixed effect, very similar results
pb(age, by = educ),
sigma.formula = ~
random(male) +
random(mig) +
random(educ) +
pb(age),
family = "BB",
method = RS(1000),
data = tl_select
)
## GAMLSS-RS iteration 1: Global Deviance = 70377
## GAMLSS-RS iteration 2: Global Deviance = 65954
## GAMLSS-RS iteration 3: Global Deviance = 65789
## GAMLSS-RS iteration 4: Global Deviance = 65788
## GAMLSS-RS iteration 5: Global Deviance = 65788
## GAMLSS-RS iteration 6: Global Deviance = 65789
## GAMLSS-RS iteration 7: Global Deviance = 65789
## GAMLSS-RS iteration 8: Global Deviance = 65789
## GAMLSS-RS iteration 9: Global Deviance = 65789
## GAMLSS-RS iteration 10: Global Deviance = 65789
## GAMLSS-RS iteration 11: Global Deviance = 65789
## GAMLSS-RS iteration 12: Global Deviance = 65789
## GAMLSS-RS iteration 13: Global Deviance = 65790
## GAMLSS-RS iteration 14: Global Deviance = 65790
## GAMLSS-RS iteration 15: Global Deviance = 65790
## GAMLSS-RS iteration 16: Global Deviance = 65790
## GAMLSS-RS iteration 17: Global Deviance = 65790
## GAMLSS-RS iteration 18: Global Deviance = 65790
## GAMLSS-RS iteration 19: Global Deviance = 65790
## GAMLSS-RS iteration 20: Global Deviance = 65790
## GAMLSS-RS iteration 21: Global Deviance = 65790
## GAMLSS-RS iteration 22: Global Deviance = 65790
## GAMLSS-RS iteration 23: Global Deviance = 65790
## GAMLSS-RS iteration 24: Global Deviance = 65790
ages <- seq(11,65,1) # ages for which to predict the test score distribution
pop_age <- expand.grid(age = ages, educ = levels(tl_select$educ), mig = levels(tl_select$mig), male = levels(tl_select$male)) # dataframe with all ages
est.values.mrp <- predictAll(GAMLSS_MRP_model_spline_sigma, newdata = pop_age, type = "response") # Predict distributional parameters all age values values and all adjustment variables
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
valid <- !is.na(est.values.mrp$mu)
pop_age <- pop_age[valid, ]
est.values.mrp <- lapply(est.values.mrp, function(x) x[valid])
# Include age in the population data grouping
population_data <- census %>% group_by(age, male, educ, mig) %>%
summarise(census_n = sum(census_n)) %>%
ungroup() %>%
group_by(age) %>%
mutate(Proportion = census_n/sum(census_n)) %>%
ungroup() %>%
mutate(male = as.factor(male)) %>%
select(-census_n)
pop_age <- left_join(pop_age, population_data, by = c("age", "educ", "mig", "male")) %>%
mutate(Proportion = if_else(is.na(Proportion), 0, Proportion)) # Append the predictions with population percentages
Min_score <- 0 # Minimum (adjusted) raw test score for which to calculate percentiles
Max_score <- 56 # Maximum (adjusted) raw test score for which to calculate percentiles
step_size <- 1
CDF_matrix_mrp <- matrix(NA, ncol = (Max_score - Min_score)/step_size + 1, nrow = nrow(pop_age))
for (i in 1:nrow(pop_age)){
CDF_matrix_mrp[i,] <- pBB(seq(from = Min_score, to = Max_score, by = step_size), mu = est.values.mrp$mu[i], sigma = est.values.mrp$sigma[i], bd = max_score)*pop_age$Proportion[i]
}
# Since we already have age-specific population weights, we can directly aggregate by age
CDF_matrix_mrp <- aggregate(CDF_matrix_mrp, list(age = pop_age$age), sum)
for (i in (2:58)){
names(CDF_matrix_mrp)[i] <- i-2
}
# Continuity correction
CDF_matrix_mrp_cc <- CDF_matrix_mrp
CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc)] <- 0.5*CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc)]+0.5*cbind(rep(0,nrow(CDF_matrix_mrp_cc)),CDF_matrix_mrp_cc[,2:(ncol(CDF_matrix_mrp_cc)-1)])
# Trim outer percentiles
CDF_matrix_mrp[,-1][which(CDF_matrix_mrp[,-1] < 0.0001, arr.ind = TRUE)] <- 0.0001
CDF_matrix_mrp[,-1][which(CDF_matrix_mrp[,-1] > 0.9999, arr.ind = TRUE)] <- 0.9999
# Transform estimated percentiles to estimated normalized z-scores
z_matrix <- qnorm(as.matrix(CDF_matrix_mrp[,-1]))
# Restrict the normalized z-scores to the range [-3; +3]
z_matrix <- apply(z_matrix, 1, function(x) {ifelse(x > 3, 3, x)})
z_matrix <- apply(z_matrix, 1, function(x) {ifelse(x < -3, -3, x)})
z_matrix <- cbind(CDF_matrix_mrp[1],z_matrix) # add age again
ages2 <- seq(11,65,1) # ages for which to predict the test score distribution
pop_age2 <- expand.grid(age = ages, educ = levels(tl_select$educ), mig = levels(tl_select$mig), male = levels(tl_select$male)) # dataframe with all ages
est.values.mrp2 <- predictAll(GAMLSS_MRP_model_spline_sigma2, newdata = pop_age2, type = "response") # Predict distributional parameters all age values values and all adjustment variables
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
## New way of prediction in random() (starting from GAMLSS version 5.0-6)
valid2 <- !is.na(est.values.mrp2$mu)
pop_age2 <- pop_age2[valid2, ]
est.values.mrp2 <- lapply(est.values.mrp2, function(x) x[valid])
# Include age in the population data grouping
population_data2 <- census %>% group_by(age, male, educ, mig) %>%
summarise(census_n = sum(census_n)) %>%
ungroup() %>%
group_by(age) %>%
mutate(Proportion = census_n/sum(census_n)) %>%
ungroup() %>%
mutate(male = as.factor(male)) %>%
select(-census_n)
pop_age2 <- left_join(pop_age2, population_data2, by = c("age", "educ", "mig", "male")) %>%
mutate(Proportion = if_else(is.na(Proportion), 0, Proportion)) # Append the predictions with population percentages
Min_score2 <- 0 # Minimum (adjusted) raw test score for which to calculate percentiles
Max_score2 <- 56 # Maximum (adjusted) raw test score for which to calculate percentiles
step_size2 <- 1
CDF_matrix_mrp2 <- matrix(NA, ncol = (Max_score - Min_score)/step_size + 1, nrow = nrow(pop_age))
for (i in 1:nrow(pop_age2)){
CDF_matrix_mrp2[i,] <- pBB(seq(from = Min_score, to = Max_score, by = step_size), mu = est.values.mrp2$mu[i], sigma = est.values.mrp2$sigma[i], bd = max_score)*pop_age2$Proportion[i]
}
# Since we already have age-specific population weights, we can directly aggregate by age
CDF_matrix_mrp2 <- aggregate(CDF_matrix_mrp2, list(age = pop_age2$age), sum)
for (i in (2:58)){
names(CDF_matrix_mrp2)[i] <- i-2
}
# Continuity correction
CDF_matrix_mrp_cc2 <- CDF_matrix_mrp2
CDF_matrix_mrp_cc2[,2:ncol(CDF_matrix_mrp_cc2)] <- 0.5*CDF_matrix_mrp_cc[,2:ncol(CDF_matrix_mrp_cc2)]+0.5*cbind(rep(0,nrow(CDF_matrix_mrp_cc2)),CDF_matrix_mrp_cc2[,2:(ncol(CDF_matrix_mrp_cc2)-1)])
# Trim outer percentiles
CDF_matrix_mrp2[,-1][which(CDF_matrix_mrp2[,-1] < 0.0001, arr.ind = TRUE)] <- 0.0001
CDF_matrix_mrp2[,-1][which(CDF_matrix_mrp2[,-1] > 0.9999, arr.ind = TRUE)] <- 0.9999
# Transform estimated percentiles to estimated normalized z-scores
z_matrix2 <- qnorm(as.matrix(CDF_matrix_mrp2[,-1]))
# Restrict the normalized z-scores to the range [-3; +3]
z_matrix2 <- apply(z_matrix2, 1, function(x) {ifelse(x > 3, 3, x)})
z_matrix2 <- apply(z_matrix2, 1, function(x) {ifelse(x < -3, -3, x)})
z_matrix2 <- cbind(CDF_matrix_mrp2[1],z_matrix2) # add age again
cdf_df <- as.data.frame(CDF_matrix_mrp)
names(cdf_df)[-1] <- as.integer(names(cdf_df)[-1]) # convert "0", "1", ..., "56" to numeric
cdf_df2 <- as.data.frame(CDF_matrix_mrp2)
names(cdf_df2)[-1] <- as.integer(names(cdf_df2)[-1]) # convert "0", "1", ..., "56" to numeric
cdf_long <- cdf_df %>%
pivot_longer(-age, names_to = "Raw_score", values_to = "percentile") %>%
mutate(Raw_score = as.numeric(Raw_score))
cdf_long2 <- cdf_df2 %>%
pivot_longer(-age, names_to = "Raw_score", values_to = "percentile") %>%
mutate(Raw_score = as.numeric(Raw_score))
# Interpolate raw scores at e6ach target percentile per age
target_p <- c(0.01, 0.05, 0.5, 0.95, 0.99)
GAMLSS_MRP_percentiles <- cdf_long %>%
group_by(age) %>%
arrange(percentile, .by_group = TRUE) %>%
reframe(
p = target_p,
Raw_score = round(approx(x = percentile, y = Raw_score, xout = target_p, ties = "ordered")$y),
.groups = "drop"
) %>%
mutate(
Source = "Without educ",
percentile_label = case_when(
abs(p - 0.01) < 1e-8 ~ "1st percentile",
abs(p - 0.05) < 1e-8 ~ "5th percentile",
abs(p - 0.5) < 1e-8 ~ "50th percentile",
abs(p - 0.95) < 1e-8 ~ "95th percentile",
abs(p - 0.99) < 1e-8 ~ "99th percentile"
),
show_text = FALSE,
line_label = percentile_label
) %>%
select(age, Raw_score, Source, percentile_label, show_text, line_label)
GAMLSS_MRP_percentiles2 <- cdf_long2 %>%
group_by(age) %>%
arrange(percentile, .by_group = TRUE) %>%
reframe(
p = target_p,
Raw_score = round(approx(x = percentile, y = Raw_score, xout = target_p, ties = "ordered")$y),
.groups = "drop"
) %>%
mutate(
Source = "With educ",
percentile_label = case_when(
abs(p - 0.01) < 1e-8 ~ "1st percentile",
abs(p - 0.05) < 1e-8 ~ "5th percentile",
abs(p - 0.5) < 1e-8 ~ "50th percentile",
abs(p - 0.95) < 1e-8 ~ "95th percentile",
abs(p - 0.99) < 1e-8 ~ "99th percentile"
),
show_text = FALSE,
line_label = percentile_label
) %>%
select(age, Raw_score, Source, percentile_label, show_text, line_label)
combined_percentiles <- RPP_percentiles %>% bind_rows(GAMLSS_MRP_percentiles) %>% bind_rows(GAMLSS_MRP_percentiles2)
combined_percentiles$Source <- factor(combined_percentiles$Source, levels = c("RPP", "Without educ", "With educ"))
color_values <- c("RPP" = "#0072B5FF", "Without educ" = "#E18727FF", "With educ" = "#FFDC91FF")
ggplot(combined_percentiles, aes(x = age, y = Raw_score,
color = Source,
group = interaction(Source, percentile_label))) +
geom_step(linewidth = 1, linetype = 1) +
scale_x_continuous(
breaks = seq(min(combined_percentiles$age), 65, by = 2),
expand = expansion(add = c(1, 8))
) +
scale_color_manual(values = color_values, name = "") +
geom_text(
data = combined_percentiles %>% filter(age == 65, show_text, !is.na(line_label)),
aes(label = line_label),
nudge_x = 1,
hjust = 0,
family = "Times"
) +
labs(x = "Age", y = "Test score", color = "") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", linewidth = 0.3),
legend.position = "top"
)
z_df <- as.data.frame(z_matrix)
names(z_df)[-1] <- as.integer(names(z_df)[-1])
z_df2 <- as.data.frame(z_matrix2)
names(z_df2)[-1] <- as.integer(names(z_df2)[-1])
z_long <- z_df %>%
pivot_longer(-age, names_to = "raw_score", values_to = "z") %>%
mutate(
raw_score = as.integer(raw_score),
GAMLSS_MRP_IQ = round(100 + 15 * z, 1) # Convert Z to IQ
) %>%
select(age, raw_score, GAMLSS_MRP_IQ)
z_long2 <- z_df2 %>%
pivot_longer(-age, names_to = "raw_score", values_to = "z") %>%
mutate(
raw_score = as.integer(raw_score),
GAMLSS_s_MRP_IQ = round(100 + 15 * z, 1) # Convert Z to IQ
) %>%
select(age, raw_score, GAMLSS_s_MRP_IQ)
iqs_norming_sample <- tl %>%
select(age = age0100, raw_score = cft) %>%
left_join(means_sds_and_ses_RPP, by = "age") %>%
left_join(z_long, by = c("age", "raw_score")) %>%
left_join(z_long2, by = c("age", "raw_score")) %>%
group_by(age) %>%
mutate(
RPP_IQ_linear = iq(raw_score, RPP_mean, RPP_sd),
dif = RPP_IQ_linear - GAMLSS_MRP_IQ,
absolute_dif = abs(dif)
) %>% ungroup()
# overall_comparison to BB selected model
iqs_norming_sample %>%
summarise(mean_RPP_IQ = mean(RPP_IQ_linear),
mean_GAMLSS_IQ = mean(GAMLSS_MRP_IQ),
mean_dif = mean(dif),
max_dif = max(dif),
mean_absolute_dif = mean(absolute_dif),
RMS_dif = sqrt(mean(dif^2)))
## # A tibble: 1 × 6
## mean_RPP_IQ mean_GAMLSS_IQ mean_dif max_dif mean_absolute_dif RMS_dif
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 103. 104. -1.17 1.54 1.44 2.37
means_ns_sds_and_ses <- iqs_norming_sample %>%
group_by(age) %>%
summarise(RPP_mean = mean(RPP_IQ_linear),
RPP_sd = sd(RPP_IQ_linear),
"Without educ_mean" = mean(GAMLSS_MRP_IQ),
"With educ_mean" = mean(GAMLSS_s_MRP_IQ)) %>%
pivot_longer(-age, names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 1))) +
scale_color_manual(values = color_values, name = "") +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean , ymax = mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 43 & source == "RPP"),
aes(label = source), family = "Times", seed = 810, nudge_y = -1, point.padding = 0, direction = "y") +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 30 & source == "Without educ"),
aes(label = source), family = "Times", seed = 810, nudge_x = 9, point.padding = 1, direction = "y") +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 29 & source == "With educ"),
aes(label = source), family = "Times", seed = 810, nudge_y = 5, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
)+
labs(x = "Age", y = "IQ")
Weighting is often undertaken using simple averages instead of model predictions. Here we apply poststratification weights we calculated based on population data and use weights provided by TwinLife and compare the results to RPP and raw data.
library(survey)
means_sds_and_ses_RPP <- sim_pop_sample_with_draws %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(RPP_mean = mean(mean_prediction),
RPP_se_of_mean = sd(mean_prediction),
RPP_sd = sqrt(mean(sd_prediction^2)),
RPP_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl_ps <- tl %>%
mutate(age = age0100) %>%
left_join((census %>%
mutate(PS_w = census_n / sum(census_n))), by = c("age", "male", "mig", "educ")) %>%
# Set PS weights of cells that exist in the sample but not in the census to 0
mutate(PS_w = ifelse(is.na(PS_w), 0, PS_w))
design_ps <- svydesign(ids = ~1, # No clustering
weights = ~PS_w, # Use renormalized poststratification weights
data = means_sds_and_ses_tl_ps)
means_sds_and_ses_tl_ps <- means_sds_and_ses_tl_ps %>%
group_by(age) %>%
summarise(
Raw_n = n(),
Raw_mean = mean(cft, na.rm = TRUE),
Raw_sd = sd(cft, na.rm = TRUE),
Raw_se_of_mean = Raw_sd / sqrt(Raw_n),
# Weighted statistics using poststratification weights
PS_mean = svymean(~cft, design = subset(design_ps, !is.na(cft) & age == cur_group()$age))[1], # Weighted mean
PS_sd = sqrt(svyvar(~cft, design = subset(design_ps, !is.na(cft) & age == cur_group()$age)))[1], # Weighted SD
PS_se_of_mean = SE(svymean(~cft, design = subset(design_ps, !is.na(cft) & age == cur_group()$age))) # SE of weighted mean
) %>%
ungroup() %>%
mutate_at(vars(-age), ~ as.numeric(.))
means_ns_sds_and_ses <- means_sds_and_ses_RPP %>%
left_join(means_sds_and_ses_tl_ps, by = "age") %>%
pivot_longer(-c(age), names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 4))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses
%>% filter(age == 65) %>% arrange(source),
aes(label = source),
family = "Times", seed = 810, nudge_x = 1, hjust = 0) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c(
"#7876B1FF",
"#BC3C29FF",
"#0072B5FF"
)) +
labs(x = "Age", y = "Mean CFT 20-R Score")
ggsave("figures/10_RPP_vs_raw_PS.jpeg", width = 8, height = 4)
means_ns_sds_and_ses %>% group_by(source) %>% summarise(mean = mean(mean), se = mean(se_of_mean))
## # A tibble: 3 × 3
## source mean se
## <chr> <dbl> <dbl>
## 1 PS 36.9 1.08
## 2 RPP 35.6 0.364
## 3 Raw 36.8 0.847
Here we use the weights provided by the TwinLife panel study data custodians, described here. We compare them with both raw and RPP means.
tl_w_ws <- tl %>%
left_join((read_dta("../unshareable_data/raw/ZA6701_weights_v7-1-0.dta") %>%
select(c(fid, svw0100, svw0200))), by = "fid") %>%
# Rename the weights
mutate(dw = svw0100, # design weights
nrw = svw0200) # non-response weights
# Renormalize the weights in the subsample (tl_w_ws)
tl_w_ws <- tl_w_ws %>%
mutate(
dw_renormalized = dw * (1000 / mean(dw, na.rm = TRUE)), # Renormalize design weights
nrw_renormalized = nrw * (1000 / mean(nrw, na.rm = TRUE)) # Renormalize non-response weights
)
# Design weights
design_dw <- svydesign(ids = ~1, # No clustering
weights = ~dw_renormalized, # Use renormalized design weights
data = tl_w_ws)
# Non-response weights
design_nrw <- svydesign(ids = ~1, # No clustering
weights = ~nrw_renormalized, # Use renormalized non-response weights
data = tl_w_ws)
means_sds_and_ses_RPP <- sim_pop_sample_with_draws %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(RPP_mean = mean(mean_prediction),
RPP_se_of_mean = sd(mean_prediction),
RPP_sd = sqrt(mean(sd_prediction^2)),
RPP_se_of_sd = sd(sd_prediction))
means_sds_and_ses_tl <- tl_w_ws %>%
group_by(age = age0100) %>%
summarise(
Raw_n = n(),
Raw_mean = mean(cft, na.rm = TRUE),
Raw_sd = sd(cft, na.rm = TRUE),
Raw_se_of_mean = Raw_sd / sqrt(Raw_n),
# Weighted statistics using design weights
Design_mean = svymean(~cft, design = subset(design_dw, !is.na(cft) & age0100 == cur_group()$age))[1], # Weighted mean
Design_sd = sqrt(svyvar(~cft, design = subset(design_dw, !is.na(cft) & age0100 == cur_group()$age)))[1], # Weighted SD
Design_se_of_mean = SE(svymean(~cft, design = subset(design_dw, !is.na(cft) & age0100 == cur_group()$age))), # SE of weighted mean
# Weighted statistics using non-response weights
Nonresponse_mean = svymean(~cft, design = subset(design_nrw, !is.na(cft) & age0100 == cur_group()$age))[1], # Weighted mean
Nonresponse_sd = sqrt(svyvar(~cft, design = subset(design_nrw, !is.na(cft) & age0100 == cur_group()$age)))[1], # Weighted SD
Nonresponse_se_of_mean = SE(svymean(~cft, design = subset(design_nrw, !is.na(cft) & age0100 == cur_group()$age))) # SE of weighted mean
) %>%
ungroup() %>%
mutate_at(vars(-age), ~ as.numeric(.))
means_ns_sds_and_ses <- means_sds_and_ses_RPP %>%
left_join(means_sds_and_ses_tl, by = "age") %>%
pivot_longer(-c(age, Raw_n), names_to = c("source", ".value"), names_pattern = "([^_]*)_(.*)")
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = mean, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 9))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = mean - 1.96*se_of_mean, ymax = mean + 1.96*se_of_mean),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = source), family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c(
"#7876B1FF",
"#6F99ADFF",
"#BC3C29FF",
"#0072B5FF"
)) +
labs(x = "Age", y = "Mean CFT 20-R Score")
ggsave("figures/11_RPP_vs_TL.jpeg", width = 8, height = 4)
No large differences to raw means.
means_ns_sds_and_ses %>%
group_by(source) %>%
summarise(mean = mean(mean), se = mean(se_of_mean))
## # A tibble: 4 × 3
## source mean se
## <chr> <dbl> <dbl>
## 1 Design 36.9 0.958
## 2 Nonresponse 36.4 0.916
## 3 RPP 35.6 0.364
## 4 Raw 36.8 0.847
Overall, non response weighting also recognised an overestimation and corrected for it, but not as much as RPP.
means_ns_sds_and_ses %>%
ggplot(aes(x = as.factor(age), y = sd, group = source, colour = source)) +
scale_x_discrete(breaks = seq(11, 65, by = 2),
expand = expansion(add = c(1, 9))) +
geom_line(linewidth = 1) +
geom_pointrange(shape = 18,
aes(ymin = sd - 1.96*se_of_sd, ymax = sd + 1.96*se_of_sd),
fatten = 3,
linewidth = 1,
position = position_dodge(width = 0.3)) +
geom_text_repel(
data = means_ns_sds_and_ses %>% filter(age == 65) %>% arrange(source),
aes(label = source),
family = "Times", seed = 810, nudge_x = 1, hjust = 0, point.padding = 1, direction = "y") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "none"
) +
scale_colour_manual(values = c(
"#7876B1FF",
"#6F99ADFF",
"#BC3C29FF",
"#0072B5FF"
)) + labs(x = "Age", y = "CFT 20-R Score SD")
means_ns_sds_and_ses %>%
group_by(source) %>%
summarise(sd = sqrt(mean(sd^2))) %>%
arrange(sd)
## # A tibble: 4 × 2
## source sd
## <chr> <dbl>
## 1 Design 7.92
## 2 Raw 8.12
## 3 RPP 8.23
## 4 Nonresponse 8.29
No large differences in SDs.
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid parallel splines stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] survey_4.4-2 survival_3.8-3 Matrix_1.7-1 foreign_0.8-87 cNORM_3.4.0
## [6] ggnewscale_0.5.1 gamlss.tr_5.1-9 gamlss_5.4-22 nlme_3.1-166 gamlss.dist_6.1-1
## [11] gamlss.data_6.0-6 rstan_2.32.6 StanHeaders_2.32.10 kableExtra_1.4.0 scales_1.3.0
## [16] ggrepel_0.9.6 haven_2.5.4 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [21] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [26] ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gridExtra_2.3 inline_0.3.20 sandwich_3.1-1 rlang_1.1.4
## [6] magrittr_2.0.3 multcomp_1.4-26 matrixStats_1.5.0 compiler_4.4.1 loo_2.8.0
## [11] systemfonts_1.1.0 vctrs_0.6.5 pkgconfig_2.0.3 fastmap_1.2.0 backports_1.5.0
## [16] labeling_0.4.3 utf8_1.2.4 cmdstanr_0.8.0 rmarkdown_2.27 tzdb_0.4.0
## [21] ps_1.7.6 ragg_1.3.2 xfun_0.45 cachem_1.1.0 jsonlite_1.8.8
## [26] highr_0.11 R6_2.5.1 bslib_0.7.0 stringi_1.8.4 jquerylib_0.1.4
## [31] estimability_1.5.1 Rcpp_1.0.12 knitr_1.47 zoo_1.8-12 R.utils_2.12.3
## [36] bayesplot_1.11.1 timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.16.0 abind_1.4-8
## [41] yaml_2.3.8 codetools_0.2-20 processx_3.8.4 pkgbuild_1.4.4 lattice_0.22-6
## [46] withr_3.0.0 bridgesampling_1.1-2 posterior_1.6.0 coda_0.19-4.1 evaluate_0.24.0
## [51] RcppParallel_5.1.9 xml2_1.3.6 pillar_1.9.0 tensorA_0.36.2.1 checkmate_2.3.2
## [56] DT_0.33 stats4_4.4.1 distributional_0.5.0 generics_0.1.3 hms_1.1.3
## [61] rstantools_2.4.0 munsell_0.5.1 xtable_1.8-4 leaps_3.2 glue_1.7.0
## [66] emmeans_1.10.6 tools_4.4.1 data.table_1.16.4 mvtnorm_1.3-2 mitools_2.4
## [71] QuickJSR_1.5.1 crosstalk_1.2.1 colorspace_2.1-1 cli_3.6.3 textshaping_0.4.0
## [76] fansi_1.0.6 viridisLite_0.4.2 svglite_2.1.3 Brobdingnag_1.2-9 gtable_0.3.6
## [81] R.methodsS3_1.8.2 sass_0.4.9 digest_0.6.36 TH.data_1.1-2 brms_2.22.0
## [86] htmlwidgets_1.6.4 farver_2.1.2 R.oo_1.27.0 htmltools_0.5.8.1 lifecycle_1.0.4
## [91] MASS_7.3-64