knitr::opts_chunk$set(
message = FALSE,
warning = T,
include = TRUE,
error = TRUE,
fig.width = 8,
fig.height = 4
)
library(tidyverse)
library(haven)
library(data.table)
library(ggrepel)
library(kableExtra)
library(brms)
library(tidybayes)
library(marginaleffects)
library(bayesplot)
library(rstan)
# depending on the platform on which you want to run the brm you might need this or not. We ran the models on a Linux-operated server
options(mc.cores = 4,
brms.backend = "cmdstanr",
scipen = 999,
digits = 4,
width = 120)
# windowsFonts(Times = windowsFont("Times New Roman"))
theme_set(theme_minimal(base_size = 12, base_family = "Times"))
## load preprocessed datasets (see preprocessing code below)
sb_data <- readRDS("data/preprocessed/wordsum/sb_data.rds")
census_sb <- readRDS("data/preprocessed/us_census/census_sb.rds")
# RPP/RPP function
source("age_norm_comparisons.R")
# load dataset after basic cleaning and labelling (see github.com/synth-science/synth-rep-dataset for the raw data and basic cleaning code)
sb_raw <- read_rds("data/raw/sb/sosci_labelled.rds")
OCC_classification <- readxl::read_xlsx(path = "data/raw/sb/OCC_classification.xlsx", sheet = "classification_table" ) %>%
mutate(occ = as.character(occ_occupation))
# load the degfield classification table
degfield_hierarchy <- readxl::read_excel("data/raw/sb/degfield_hierarchy.xlsx") %>% # contains a classification of the general codes into larger categories based on this wikipedia article: https://en.wikipedia.org/wiki/Outline_of_academic_disciplines
mutate(
degfield_label = str_to_lower(degfield_label),
degfield_category = str_to_lower(degfield_category),
degfield_alternative_category = str_to_lower(degfield_alternative_category))
divisions <- list(
new_england = c("connecticut", "maine", "massachusetts", "new hampshire", "rhode island", "vermont"),
middle_atlantic = c("new jersey", "new york", "pennsylvania"),
east_north_central = c("illinois", "indiana", "michigan", "ohio", "wisconsin"),
west_north_central = c("iowa", "kansas", "minnesota", "missouri", "nebraska", "north dakota", "south dakota"),
south_atlantic = c("delaware", "district of columbia", "florida", "georgia", "maryland",
"north carolina", "south carolina", "virginia", "west virginia"),
east_south_central = c("alabama", "kentucky", "mississippi", "tennessee"),
west_south_central = c("arkansas", "louisiana", "oklahoma/indian territory", "oklahoma", "texas"),
mountain = c("arizona", "colorado", "idaho", "montana", "nevada", "new mexico", "utah", "wyoming"),
pacific = c("alaska", "california", "hawaii", "oregon", "washington")
)
sb_data <- sb_raw %>%
select(CASE,
starts_with("SD"),
gender,
starts_with("WS")) %>%
rename(educ = SD08,
occ = SD10) %>%
setNames(sapply(names(.), function(x) {
label <- attr(sb_raw[[x]], "label", exact = TRUE)
if (!is.null(label)) label else x
})) %>%
rename_all(tolower) %>%
rename(age = "age: [01]",
inctot = "inctot: [01]") %>%
left_join(OCC_classification %>% select(occ, occ_category),
by = "occ") %>% # to classify the occ variable merge with the OCC_classification df
mutate(across(where(~ is.character(.) || is.factor(.) || inherits(., "labelled")),
~ if (is.factor(.)) {
factor(tolower(as.character(.)))
} else if (inherits(., "labelled")) {
haven::labelled(tolower(as.character(.)), attr(., "labels"))
} else {
tolower(.)
})) %>%
left_join(degfield_hierarchy %>% select(degfield_label, degfield_category), # to categorize degfield merge with degfield_hierarchy
by = c("degfield" = "degfield_label")) %>%
rename_all(~ gsub(" ", "_", .)) %>%
rename_all(~ gsub("-", "", .)) %>%
mutate(
region_birth = case_when(
bpl_state %in% divisions$new_england ~ "new england division",
bpl_state %in% divisions$middle_atlantic ~ "middle atlantic division",
bpl_state %in% divisions$east_north_central ~ "east north central division",
bpl_state %in% divisions$west_north_central ~ "west north central division",
bpl_state %in% divisions$south_atlantic ~ "south atlantic division",
bpl_state %in% divisions$east_south_central ~ "east south central division",
bpl_state %in% divisions$west_south_central ~ "west south central division",
bpl_state %in% divisions$mountain ~ "mountain division",
bpl_state %in% divisions$pacific ~ "pacific division",
TRUE ~ "abroad"
),
region_residence = case_when(
state %in% divisions$new_england ~ "new england division",
state %in% divisions$middle_atlantic ~ "middle atlantic division",
state %in% divisions$east_north_central ~ "east north central division",
state %in% divisions$west_north_central ~ "west north central division",
state %in% divisions$south_atlantic ~ "south atlantic division",
state %in% divisions$east_south_central ~ "east south central division",
state %in% divisions$west_south_central ~ "west south central division",
state %in% divisions$mountain ~ "mountain division",
state %in% divisions$pacific ~ "pacific division",
TRUE ~ "abroad"
),
income_brackets = cut(as.integer(inctot),
breaks = c(-Inf, 1000, 6000, 12500, 22500, 35000, 50000, 60000, 75000, 90000, 110000, Inf),
ordered_result = FALSE,
labels = c("< $1,000", "$1,000 - $5,999", "$6,000 - $12,499", "$12,500 - $22,499",
"$22,500 - $34,999", "$35,000 - $49,999", "$50,000 - $59,999",
"$60,000 - $74,999", "$75,000 - $89,999", "$90,000 - $109,999", ">$110,000")),
educ = case_when(
educ %in% c("n/a or no schooling",
"grade 5, 6, 7, or 8",
"grade 9",
"grade 10",
"grade 11",
"grade 12") ~ "high school or less",
educ == "3 years of college" ~ "2 years of college",
TRUE ~ educ # Retain other categories as they are
),
female = sex == "female",
hispan = hispan != "no, not of hispanic, latino, or spanish origin",
marst = case_when(
marst %in% c("divorced", "separated", "widowed") ~ "separated, widowed, or divorced",
TRUE ~ marst # Keep other categories as is
),
race = case_when(
race %in% c("chinese",
"japanese",
"other asian or pacific islander",
"american indian or alaska native",
"other race") ~ "other",
race %in% c("three or more major races",
"two major races (e.g., white and japanese)") ~ "two or more major races",
TRUE ~ race # Keep other categories as is
),
degfield_branch = case_when(
degfield_category %in% c("formal sciences", "interdisciplinary and multi-disciplinary studies (general)", "natural sciences") ~ "natural, formal, and other sciences",
is.na(degfield_category) ~ "n/a",
TRUE ~ degfield_category
),
occ_category = case_when(
occ_category %in% c("Natural Resources, Construction, and Maintenance Occupations", "Service Occupations") ~
"other",
is.na(occ_category) ~ "n/a: unemployed",
TRUE ~ occ_category
),
wordsum_1 = if_else(wordsum_1 == 4, 1, 0),
wordsum_2 = if_else(wordsum_2 == 5, 1, 0),
wordsum_3 = if_else(wordsum_3 == 5, 1, 0),
wordsum_4 = if_else(wordsum_4 == 3, 1, 0),
wordsum_5 = if_else(wordsum_5 == 1, 1, 0),
wordsum_6 = if_else(wordsum_6 == 3, 1, 0),
wordsum_7 = if_else(wordsum_7 == 5, 1, 0),
wordsum_8 = if_else(wordsum_8 == 4, 1, 0),
wordsum_9 = if_else(wordsum_9 == 4, 1, 0),
wordsum_10 = if_else(wordsum_10 == 1, 1, 0),
wordsum = wordsum_1 + wordsum_2 + wordsum_3 + wordsum_4 + wordsum_5 + wordsum_6 + wordsum_7 + wordsum_8 + wordsum_9 + wordsum_10,
wordsum_ordinal = as.ordered(wordsum),
censoring = case_when(
wordsum == 0 ~ "left",
wordsum == 10 ~ "right",
TRUE ~ "none"
)
) %>%
select(CASE = case, age, female, educ, income_brackets, income = inctot, race, hispan, region_residence, state, region_birth, degfield_branch, marst, occ_category, starts_with("wordsum"), censoring) %>%
filter(complete.cases(.)) %>%
left_join(sb_raw, by = "CASE") %>%
select(-ends_with("_R"))
saveRDS(sb_data, "data/preprocessed/wordsum/sb_data.rds")
ggplot(sb_data, aes(x = wordsum)) +
geom_histogram(bins = 11, fill = "blue", color = "black") + # Set bins to 11 for values 0 to 10
scale_x_continuous(breaks = 0:10) + # Set x-axis breaks for each possible value
labs(x = "Wordsum score",
y = "Frequency") + theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.background = element_rect(fill = "transparent", color = NA),
# panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()) +
geom_text(stat = 'count', aes(label = after_stat(count)),
vjust = -0.5, # Adjust vertical position of labels
color = "black") # Color of the labels
census_sb_raw <- setDT(read_dta("data/raw/us_census/usa_00015.dta.gz"))
OCC_classification <- readxl::read_xlsx(path = "data/raw/sb/OCC_classification.xlsx", sheet = "classification_table" ) %>%
mutate(occ = as.character(occ),
occ_category = tolower(as.character(occ_category))) # change class to character
# Convert labelled columns to factors
census_sb_raw <- census_sb_raw %>%
mutate_if(haven::is.labelled, as_factor)
# Convert to data.table
census_sb_raw <- as.data.table(census_sb_raw)
# Define income brackets
income_breaks <- c(-Inf, 1000, 6000, 12500, 22500, 35000, 50000, 60000, 75000, 90000, 110000, Inf)
income_labels <- c("< $1,000", "$1,000 - $5,999", "$6,000 - $12,499", "$12,500 - $22,499",
"$22,500 - $34,999", "$35,000 - $49,999", "$50,000 - $59,999",
"$60,000 - $74,999", "$75,000 - $89,999", "$90,000 - $109,999", ">$110,000")
# Main processing
census_sb <- census_sb_raw[, `:=`(
income_brackets = cut(as.integer(inctot), breaks = income_breaks, labels = income_labels, ordered_result = TRUE)
)][, .(census_n = sum(perwt)), by = .(state = statefip, sex, age, marst, race, hispan, bpl, educ, degfield, occ, income_brackets)]
# Convert columns to factors and lowercase
factor_cols <- c("state", "sex", "marst", "race", "hispan", "bpl", "educ", "degfield", "occ", "income_brackets")
census_sb[, (factor_cols) := lapply(.SD, function(x) factor(tolower(as.character(x)))), .SDcols = factor_cols]
OCC_classification <- as.data.table(OCC_classification)
degfield_hierarchy <- as.data.table(degfield_hierarchy)
# Join with OCC_classification and degfield_hierarchy
census_sb <- merge(census_sb, OCC_classification[, .(occ, occ_category)], by = "occ", all.x = TRUE)
census_sb <- merge(census_sb, degfield_hierarchy[, .(degfield_label, degfield_category)], by.x = "degfield", by.y = "degfield_label", all.x = TRUE)
# Apply transformations
census_sb[, `:=`(
region_birth = fcase(
bpl %in% divisions$new_england, "new england division",
bpl %in% divisions$middle_atlantic, "middle atlantic division",
bpl %in% divisions$east_north_central, "east north central division",
bpl %in% divisions$west_north_central, "west north central division",
bpl %in% divisions$south_atlantic, "south atlantic division",
bpl %in% divisions$east_south_central, "east south central division",
bpl %in% divisions$west_south_central, "west south central division",
bpl %in% divisions$mountain, "mountain division",
bpl %in% divisions$pacific, "pacific division",
default = "abroad"
),
region_residence = fcase(
state %in% divisions$new_england, "new england division",
state %in% divisions$middle_atlantic, "middle atlantic division",
state %in% divisions$east_north_central, "east north central division",
state %in% divisions$west_north_central, "west north central division",
state %in% divisions$south_atlantic, "south atlantic division",
state %in% divisions$east_south_central, "east south central division",
state %in% divisions$west_south_central, "west south central division",
state %in% divisions$mountain, "mountain division",
state %in% divisions$pacific, "pacific division",
default = "abroad"
),
age = as.numeric(age),
female = sex == "female",
marst = fcase(
marst %in% c("married, spouse present", "married, spouse absent"), "married",
marst %in% c("divorced", "separated", "widowed"), "separated, widowed, or divorced",
default = as.character(marst)
),
educ = fcase(
educ %in% c("nursery school to grade 4", "n/a or no schooling", "grade 5, 6, 7, or 8", "grade 9", "grade 10", "grade 11", "grade 12"), "high school or less",
default = as.character(educ)
),
hispan = hispan != "not hispanic",
race = fcase(
race %in% c("chinese", "japanese", "other asian or pacific islander", "american indian or alaska native", "other race, nec"), "other",
race %in% c("three or more major races", "two major races"), "two or more major races",
default = as.character(race)
),
degfield_branch = fcase(
degfield_category %in% c("formal sciences", "interdisciplinary and multi-disciplinary studies (general)", "natural sciences"), "natural, formal, and other sciences",
is.na(degfield_category), "n/a",
default = as.character(degfield_category)
),
occ_category = fcase(
occ_category %in% c("Natural Resources, Construction, and Maintenance Occupations", "Service Occupations"), "other",
default = as.character(occ_category)
)
)]
# Select and filter
census_sb <- census_sb[, .SD, .SDcols = !c("degfield", "bpl", "state", "occ", "degfield_category", "sex")]
census_sb <- census_sb[between(age, 21, 67)]
# Convert remaining columns to factors
factor_cols <- names(census_sb)[sapply(census_sb, is.character)]
census_sb[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
census_sb <- census_sb %>%
relocate(census_n, .after = "degfield_branch") %>%
group_by(age, female, educ, income_brackets, race, hispan, region_residence, region_birth, degfield_branch, marst, occ_category) %>%
summarise(census_n = sum(census_n)) %>%
ungroup()
saveRDS(census_sb, "data/preprocessed/us_census/census_sb.rds")
census_sb %>% head(14) %>% kable(digits = 2) %>% kable_styling(full_width = FALSE)
age | female | educ | income_brackets | race | hispan | region_residence | region_birth | degfield_branch | marst | occ_category | census_n |
---|---|---|---|---|---|---|---|---|---|---|---|
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | abroad | n/a | never married/single | n/a: unemployed | 321 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | abroad | n/a | never married/single | sales and office occupations | 29 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | east north central division | n/a | never married/single | management, business, science, and arts occupations | 122 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | east north central division | n/a | never married/single | n/a: unemployed | 2776 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | east north central division | n/a | never married/single | production, transportation, and material moving occupations | 305 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | east north central division | n/a | never married/single | sales and office occupations | 400 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | east north central division | n/a | never married/single | service occupations | 567 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | middle atlantic division | n/a | never married/single | n/a: unemployed | 31 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | new england division | n/a | never married/single | sales and office occupations | 35 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | pacific division | n/a | never married/single | service occupations | 2 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | south atlantic division | n/a | never married/single | n/a: unemployed | 123 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | south atlantic division | n/a | never married/single | sales and office occupations | 67 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | west north central division | n/a | never married/single | production, transportation, and material moving occupations | 21 |
21 | FALSE | 1 year of college | < $1,000 | black/african american | FALSE | east north central division | west north central division | n/a | never married/single | sales and office occupations | 22 |
cat_sizes_census <- census_sb %>%
select(-age, -census_n) %>%
map_dfr(~ count(tibble(Category = as.character(.x)), Category),
.id = "Variable") %>%
arrange(Variable, Category)
cat_sizes_sb <- sb_data %>%
select(-age, -wordsum, -CASE) %>%
map_dfr(~ count(tibble(Category = as.character(.x)), Category),
.id = "Variable") %>%
arrange(Variable, Category)
anti_join(cat_sizes_census, cat_sizes_sb, by = "Category")
## # A tibble: 0 × 3
## # ℹ 3 variables: Variable <chr>, Category <chr>, n <int>
Before applying RPP to correct estimates (be it for norming or other purposes), it’s worthwhile to check if there are any differences between the sample and the population (on which we have data from the census) with respect to the variables we wish to use for the correction.
aggregated_sample <- sb_data %>%
filter(between(age, 21, 67)) %>%
mutate(across(-c(starts_with("wordsum"), censoring, CASE, age, female, hispan, income_brackets), as.factor)) %>%
group_by(age, female, educ, income_brackets, race, hispan, region_residence, region_birth, degfield_branch, marst, occ_category) %>%
summarise(sample_n = n()) %>% ungroup()
disparities_plot <- census_sb %>%
full_join(aggregated_sample, by = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category")) %>%
mutate(sample_n = replace_na(sample_n, 0),
census_n = replace_na(census_n, 0),
occ_category = case_when(
occ_category == "management, business, science, and arts occupations" ~ "MBSA",
occ_category == "sales and office occupations" ~ "SO",
occ_category == "production, transportation, and material moving occupations" ~ "PTMM",
occ_category == "service occupations" ~ "S",
occ_category == "natural resources, construction, and maintenance occupations" ~ "NRCM",
TRUE ~ occ_category
)) %>%
pivot_longer(cols = c(census_n:sample_n), names_to = "source", values_to = "n") %>%
mutate(source = str_sub(source, 1, -3),
n = replace_na(n, 0)) %>%
mutate(across(-c(n), as.factor)) %>%
pivot_longer(cols = -c(n, source), names_to = "variable", values_to = "category") %>%
group_by(source, variable, category) %>%
summarise(n = sum(n)) %>%
mutate(percentage = n/sum(n)*100)
# plot age disparities
disparities_plot %>%
filter(variable == "age") %>%
ggplot(aes(x = category, y = percentage, group = source, colour = source, label = round(percentage, digits = 1))) +
theme_minimal(base_size = 9, base_family = "Times") +
geom_line(linewidth = .5) +
facet_grid(cols = vars(variable), scales = "free", space = 'free', as.table = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
# legend.justification = c(1, 1),
legend.background = element_rect(fill = "transparent", color = NA),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3)) +
labs(x = "", y = "Percentage", colour = "", label = "Percentage") +
scale_colour_manual(values = c("#0072B5FF", "#BC3C29FF"), labels = function(x) str_to_title(x)) +
scale_x_discrete(breaks = seq(21, 67, by = 2))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# ggsave("p.jpeg", height = 2, width = 10)
disparities_plot %>%
mutate(category = case_when(
variable == "educ" ~ factor(category, levels = c("high school or less", "1 year of college", "2 years of college", "4 years of college", "5+ years of college")),
variable == "income_brackets" ~ factor(category, levels = c("< $1,000", "$1,000 - $5,999", "$6,000 - $12,499", "$12,500 - $22,499", "$22,500 - $34,999", "$35,000 - $49,999", "$50,000 - $59,999", "$60,000 - $74,999", "$75,000 - $89,999", "$90,000 - $109,999", ">$110,000")),
variable == "degfield_branch" ~ factor(category, levels = c("n/a", "humanities", "social sciences", "applied sciences and professions", "natural, formal, and other sciences")),
variable == "occ_category" ~ factor(category, levels = c("n/a: unemployed", "MBSA", "SO", "S", "NRCM", "PTMM")),
variable == "race" ~ factor(category, levels = c("white", "black/african american", "two or more major races", "other")),
variable == "region_birth" ~ factor(category, levels = c("abroad", "pacific division", "middle atlantic division", "new england division", "mountain division", "east south central division", "south atlantic division", "west south central division", "east north central division", "west north central division")),
TRUE ~ category
)) %>%
filter(variable != "age") %>%
ggplot(aes(x = category, y = percentage, group = source, colour = source, label = round(percentage, digits = 1))) +
theme_minimal(base_size = 9, base_family = "Times") +
geom_line(linewidth = .5) +
facet_grid(cols = vars(variable), scales = "free", space = 'free') +
theme(axis.text.x = element_text(angle = 60, hjust = 1),
legend.position = c(1, 1.08),
legend.justification = c(1, 1),
legend.background = element_rect(fill = "transparent", color = NA),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
plot.margin = margin(0, 0, 0, .05, "cm")) +
labs(x = "Category", y = "Percentage", colour = "", label = "Percentage") +
scale_colour_manual(values = c("#0072B5FF", "#BC3C29FF"), labels = function(x) str_to_title(x))
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# ggsave("p.jpeg", height = 3, width = 9)
Terms of this form (1 | variable)
represent random
intercepts, female
and hispan
were included as
a fixed effect to avoid estimation problems due to the small number of
categories, s(age)
is a thin plate spline of age.
brm_1 <- brm(bf(wordsum ~ s(age, by = educ) + (1 | educ) + female + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + (1 | income_brackets) +
(1 | educ:race) +
(1 | educ:hispan) +
(1 | educ:region_residence) +
(1 | educ:marst) +
(1 | educ:degfield_branch) +
(1 | educ:occ_category) +
(1 | educ:region_birth) +
(1 | educ:income_brackets) +
(1 | race:hispan) +
(1 | race:region_residence) +
(1 | race:marst) +
(1 | race:degfield_branch) +
(1 | race:occ_category) +
(1 | race:region_birth) +
(1 | race:income_brackets) +
(1 | hispan:region_residence) +
(1 | hispan:marst) +
(1 | hispan:degfield_branch) +
(1 | hispan:occ_category) +
(1 | hispan:region_birth) +
(1 | hispan:income_brackets) +
(1 | region_residence:marst) +
(1 | region_residence:degfield_branch) +
(1 | region_residence:occ_category) +
(1 | region_residence:region_birth) +
(1 | region_residence:income_brackets) +
(1 | marst:degfield_branch) +
(1 | marst:occ_category) +
(1 | marst:region_birth) +
(1 | marst:income_brackets) +
(1 | degfield_branch:occ_category) +
(1 | degfield_branch:region_birth) +
(1 | degfield_branch:income_brackets) +
(1 | occ_category:region_birth) +
(1 | occ_category:income_brackets) +
(1 | region_birth:income_brackets) +
(1 | female:educ) +
(1 | female:race) +
(1 | female:hispan) +
(1 | female:region_residence) +
(1 | female:marst) +
(1 | female:degfield_branch) +
(1 | female:occ_category) +
(1 | female:region_birth) +
(1 | female:income_brackets),
sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
file = "data/brms/wordsum/sb/brm_1",
data = sb_data) %>%
add_criterion("loo")
brm_1
## Warning: There were 76 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum ~ s(age, by = educ) + (1 | educ) + female + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + (1 | income_brackets) + (1 | educ:race) + (1 | educ:hispan) + (1 | educ:region_residence) + (1 | educ:marst) + (1 | educ:degfield_branch) + (1 | educ:occ_category) + (1 | educ:region_birth) + (1 | educ:income_brackets) + (1 | race:hispan) + (1 | race:region_residence) + (1 | race:marst) + (1 | race:degfield_branch) + (1 | race:occ_category) + (1 | race:region_birth) + (1 | race:income_brackets) + (1 | hispan:region_residence) + (1 | hispan:marst) + (1 | hispan:degfield_branch) + (1 | hispan:occ_category) + (1 | hispan:region_birth) + (1 | hispan:income_brackets) + (1 | region_residence:marst) + (1 | region_residence:degfield_branch) + (1 | region_residence:occ_category) + (1 | region_residence:region_birth) + (1 | region_residence:income_brackets) + (1 | marst:degfield_branch) + (1 | marst:occ_category) + (1 | marst:region_birth) + (1 | marst:income_brackets) + (1 | degfield_branch:occ_category) + (1 | degfield_branch:region_birth) + (1 | degfield_branch:income_brackets) + (1 | occ_category:region_birth) + (1 | occ_category:income_brackets) + (1 | region_birth:income_brackets) + (1 | female:educ) + (1 | female:race) + (1 | female:hispan) + (1 | female:region_residence) + (1 | female:marst) + (1 | female:degfield_branch) + (1 | female:occ_category) + (1 | female:region_birth) + (1 | female:income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sageeduc1yearofcollege_1) 1.73 1.63 0.06 6.02 1.00 2623 2568
## sds(sageeduc2yearsofcollege_1) 1.61 1.40 0.06 5.39 1.00 2025 2283
## sds(sageeduc4yearsofcollege_1) 1.62 1.65 0.05 6.15 1.00 1583 2159
## sds(sageeduc5Pyearsofcollege_1) 1.86 1.29 0.11 5.06 1.00 1896 1219
## sds(sageeduchighschoolorless_1) 1.29 1.16 0.06 4.19 1.00 2372 2386
## sds(sigma_sage_1) 1.14 0.88 0.07 3.30 1.00 1392 1883
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.39 0.41 0.01 1.41 1.00 2495 2007
## sd(sigma_Intercept) 0.13 0.15 0.00 0.51 1.00 1812 2667
##
## ~degfield_branch:income_brackets (Number of levels: 52)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.16 0.11 0.01 0.42 1.00 1682 1951
##
## ~degfield_branch:occ_category (Number of levels: 24)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.36 0.20 0.04 0.81 1.00 1498 1385
##
## ~degfield_branch:region_birth (Number of levels: 47)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.10 0.01 0.37 1.00 2055 2503
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.43 0.42 0.01 1.58 1.00 2863 2551
## sd(sigma_Intercept) 0.11 0.13 0.00 0.45 1.00 1866 2557
##
## ~educ:degfield_branch (Number of levels: 17)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.28 0.22 0.01 0.80 1.00 1537 2010
##
## ~educ:hispan (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.33 0.26 0.01 1.00 1.00 1972 2508
##
## ~educ:income_brackets (Number of levels: 53)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.10 0.01 0.37 1.00 1699 2007
##
## ~educ:marst (Number of levels: 15)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.23 0.19 0.01 0.68 1.00 1368 2369
##
## ~educ:occ_category (Number of levels: 28)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.28 0.17 0.02 0.67 1.00 1560 2061
##
## ~educ:race (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.31 0.20 0.02 0.75 1.00 1191 1967
##
## ~educ:region_birth (Number of levels: 48)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.11 0.09 0.00 0.33 1.00 2069 1655
##
## ~educ:region_residence (Number of levels: 43)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.10 0.00 0.37 1.00 1777 2289
##
## ~female:degfield_branch (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.23 0.20 0.01 0.72 1.00 2170 1890
##
## ~female:educ (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.26 0.20 0.01 0.78 1.00 1910 2132
##
## ~female:hispan (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.85 1.00 0.02 3.41 1.00 1631 1540
##
## ~female:income_brackets (Number of levels: 22)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.19 0.13 0.01 0.51 1.00 1551 2248
##
## ~female:marst (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.37 0.40 0.01 1.45 1.00 1818 1726
##
## ~female:occ_category (Number of levels: 12)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.27 0.25 0.01 0.94 1.00 2019 2098
##
## ~female:race (Number of levels: 8)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.41 0.33 0.02 1.22 1.00 1407 1874
##
## ~female:region_birth (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.12 0.09 0.00 0.35 1.00 2523 2453
##
## ~female:region_residence (Number of levels: 18)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.11 0.00 0.42 1.00 2679 2459
##
## ~hispan:degfield_branch (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.23 0.21 0.01 0.80 1.00 2628 1960
##
## ~hispan:income_brackets (Number of levels: 22)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.15 0.11 0.01 0.43 1.00 2413 2571
##
## ~hispan:marst (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.47 0.46 0.02 1.64 1.00 2343 2559
##
## ~hispan:occ_category (Number of levels: 12)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.32 0.27 0.01 0.96 1.00 1866 1821
##
## ~hispan:region_birth (Number of levels: 19)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.11 0.01 0.42 1.00 2942 2314
##
## ~hispan:region_residence (Number of levels: 18)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.26 0.18 0.01 0.68 1.00 1671 1833
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.16 0.13 0.01 0.49 1.00 2596 2346
## sd(sigma_Intercept) 0.08 0.06 0.00 0.23 1.00 1850 2090
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.81 0.86 0.03 3.13 1.00 3015 2144
## sd(sigma_Intercept) 0.57 0.60 0.08 2.41 1.00 2125 1688
##
## ~marst:degfield_branch (Number of levels: 15)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.17 0.14 0.01 0.52 1.00 2088 2167
##
## ~marst:income_brackets (Number of levels: 33)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.15 0.11 0.01 0.41 1.00 1963 2235
##
## ~marst:occ_category (Number of levels: 17)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.18 0.16 0.01 0.58 1.00 2181 2030
##
## ~marst:region_birth (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.10 0.01 0.37 1.00 1880 2035
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.40 0.37 0.01 1.34 1.00 2860 2434
## sd(sigma_Intercept) 0.20 0.19 0.01 0.69 1.01 1394 1796
##
## ~occ_category:income_brackets (Number of levels: 48)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.16 0.12 0.01 0.43 1.00 1761 1907
##
## ~occ_category:region_birth (Number of levels: 46)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.11 0.09 0.00 0.32 1.00 2029 1729
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.50 0.53 0.01 2.04 1.00 2554 2168
## sd(sigma_Intercept) 0.17 0.24 0.01 0.79 1.00 1947 2370
##
## ~race:degfield_branch (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.17 0.13 0.01 0.51 1.00 2331 2453
##
## ~race:hispan (Number of levels: 8)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.26 0.27 0.01 0.98 1.00 2428 2607
##
## ~race:income_brackets (Number of levels: 44)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.20 0.14 0.01 0.53 1.00 1440 1826
##
## ~race:marst (Number of levels: 12)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.33 0.26 0.01 0.97 1.00 1438 1849
##
## ~race:occ_category (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.31 0.23 0.02 0.87 1.00 1528 2533
##
## ~race:region_birth (Number of levels: 34)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.10 0.01 0.37 1.00 2495 2492
##
## ~race:region_residence (Number of levels: 33)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.16 0.11 0.01 0.42 1.00 2303 2563
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.16 0.14 0.01 0.53 1.00 2482 1973
## sd(sigma_Intercept) 0.10 0.08 0.00 0.28 1.00 1801 2580
##
## ~region_birth:income_brackets (Number of levels: 104)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.31 0.16 0.02 0.62 1.00 752 1376
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.19 0.16 0.01 0.61 1.00 2868 2362
## sd(sigma_Intercept) 0.12 0.09 0.00 0.34 1.00 1436 1850
##
## ~region_residence:degfield_branch (Number of levels: 43)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.20 0.14 0.01 0.51 1.00 1139 1648
##
## ~region_residence:income_brackets (Number of levels: 96)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.25 0.15 0.01 0.56 1.00 801 1665
##
## ~region_residence:marst (Number of levels: 27)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.17 0.12 0.01 0.46 1.00 1571 2026
##
## ~region_residence:occ_category (Number of levels: 42)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.16 0.12 0.01 0.45 1.00 1835 1807
##
## ~region_residence:region_birth (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.22 0.15 0.01 0.55 1.00 936 1752
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.24 1.34 4.57 9.96 1.00 1996 1393
## sigma_Intercept 0.35 0.51 -0.68 1.41 1.00 1567 1254
## femaleTRUE 0.06 1.34 -3.03 2.56 1.00 1818 1180
## hispanTRUE -0.10 1.45 -3.18 2.77 1.00 1820 1308
## sigma_hispanTRUE -0.09 0.17 -0.44 0.24 1.00 2803 2277
## sigma_femaleTRUE -0.21 0.11 -0.43 0.00 1.00 2035 2636
## sage:educ1yearofcollege_1 3.09 5.89 -9.10 15.33 1.00 2390 2177
## sage:educ2yearsofcollege_1 0.26 4.83 -10.49 9.65 1.00 2201 1749
## sage:educ4yearsofcollege_1 2.35 5.25 -9.67 11.55 1.01 1468 1239
## sage:educ5Pyearsofcollege_1 2.74 4.68 -7.73 12.21 1.00 2654 2229
## sage:educhighschoolorless_1 -0.18 3.90 -9.18 6.73 1.00 2214 1854
## sigma_sage_1 0.53 2.62 -4.32 6.49 1.00 1967 2108
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_1)
Many large sd estimates for the random intercepts but most CI bounds, especially for interactions, either include 0 or approach it, indicating that the data are not large enough to estimate interaction effects.
brm_2 <- brm(bf(
wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
family = gaussian(), # default
iter = 2000, # default
chains = 4, # default
control = list(adapt_delta = 0.8), # default
seed = 14,
file = "data/brms/wordsum/sb/brm_2",
data = sb_data) %>%
add_criterion("loo")
brm_2
## Warning: There were 47 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 1.22 1.01 0.05 3.71 1.00 1322 2045
## sds(sigma_sage_1) 0.63 0.58 0.02 2.13 1.00 1306 1634
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.56 0.37 0.15 1.52 1.00 1497 1292
## sd(sigma_Intercept) 0.09 0.09 0.00 0.32 1.00 1985 2310
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.55 0.35 0.14 1.49 1.00 1449 1507
## sd(sigma_Intercept) 0.06 0.08 0.00 0.25 1.00 2382 2210
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.18 0.13 0.01 0.48 1.00 1244 1641
## sd(sigma_Intercept) 0.06 0.05 0.00 0.19 1.00 1873 1847
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.74 0.69 0.12 2.67 1.00 1621 1993
## sd(sigma_Intercept) 0.51 0.49 0.10 1.90 1.00 1294 1234
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.39 0.33 0.01 1.23 1.01 858 1419
## sd(sigma_Intercept) 0.12 0.12 0.00 0.42 1.00 1583 1687
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.35 0.35 0.02 1.27 1.00 1157 1486
## sd(sigma_Intercept) 0.13 0.16 0.00 0.56 1.00 1164 1778
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.12 0.10 0.00 0.36 1.00 1893 1829
## sd(sigma_Intercept) 0.08 0.06 0.00 0.23 1.00 1320 1922
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.10 0.00 0.39 1.00 1844 2167
## sd(sigma_Intercept) 0.08 0.06 0.00 0.24 1.00 1508 1894
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.41 0.72 5.87 8.81 1.00 1810 2104
## sigma_Intercept 0.50 0.39 -0.29 1.39 1.00 1174 957
## hispanTRUE -0.29 0.21 -0.73 0.13 1.00 4984 2539
## femaleTRUE -0.11 0.16 -0.43 0.21 1.00 5674 2963
## sigma_hispanTRUE 0.03 0.11 -0.18 0.24 1.00 5495 2900
## sigma_femaleTRUE -0.11 0.08 -0.26 0.03 1.00 5576 2875
## sage_1 0.03 3.37 -8.40 4.94 1.00 1676 1315
## sigma_sage_1 -0.08 1.55 -3.04 3.58 1.00 2166 1738
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_2)
loo_compare(brm_1, brm_2)
## elpd_diff se_diff
## brm_2 0.0 0.0
## brm_1 -12.8 7.4
The model without interactions fits slightly better, probably because any predictions based on the interactions are way too uncertain to make a difference in average prediction accuracy. The divergent transitions are concerning, but most of them are gone when modelling the outcome as ordinal or when boosting the adapt_delta parameter.
brm_2_boosted <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
iter = 5000,
control = list(adapt_delta = 0.999),
file = "data/brms/wordsum/sb/brm_2_boosted",
data = sb_data) %>%
add_criterion("loo")
brm_2_boosted
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 5000; warmup = 2500; thin = 1;
## total post-warmup draws = 10000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 1.18 1.04 0.04 3.81 1.00 3265 4625
## sds(sigma_sage_1) 0.64 0.60 0.02 2.22 1.00 2566 3452
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.54 0.35 0.13 1.49 1.00 3369 3707
## sd(sigma_Intercept) 0.09 0.10 0.00 0.32 1.00 4610 4868
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.55 0.36 0.13 1.48 1.00 3394 4123
## sd(sigma_Intercept) 0.06 0.08 0.00 0.26 1.00 5770 5598
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.18 0.13 0.01 0.47 1.00 3440 4757
## sd(sigma_Intercept) 0.06 0.05 0.00 0.18 1.00 4593 4442
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.73 0.70 0.13 2.66 1.00 3859 5866
## sd(sigma_Intercept) 0.53 0.56 0.10 2.16 1.00 3222 5312
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.40 0.34 0.02 1.24 1.00 2297 3554
## sd(sigma_Intercept) 0.11 0.11 0.00 0.40 1.00 3996 4522
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.34 0.34 0.02 1.22 1.00 3739 4318
## sd(sigma_Intercept) 0.13 0.17 0.00 0.57 1.00 4046 5610
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.11 0.09 0.00 0.35 1.00 4933 4409
## sd(sigma_Intercept) 0.08 0.06 0.00 0.23 1.00 3056 3916
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.11 0.00 0.39 1.00 4399 5168
## sd(sigma_Intercept) 0.08 0.06 0.00 0.24 1.00 2967 3920
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.43 0.73 5.97 8.93 1.00 4622 4784
## sigma_Intercept 0.49 0.42 -0.38 1.37 1.00 4041 3735
## hispanTRUE -0.29 0.22 -0.72 0.14 1.00 12406 7673
## femaleTRUE -0.11 0.16 -0.43 0.21 1.00 11217 7293
## sigma_hispanTRUE 0.03 0.11 -0.18 0.24 1.00 12968 7709
## sigma_femaleTRUE -0.11 0.08 -0.26 0.04 1.00 11731 6726
## sage_1 0.13 3.44 -9.04 4.88 1.00 4346 4132
## sigma_sage_1 -0.10 1.66 -3.26 3.73 1.00 4384 4082
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_2_boosted)
loo_compare(brm_2, brm_2_boosted)
## elpd_diff se_diff
## brm_2 0.0 0.0
## brm_2_boosted -0.8 0.3
No difference between the model with 47/4000 divergent transitions and the same model with no divergent transitions.
priors <-
prior(normal(0,1), class = "b") +
prior(normal(0,1), class = "Intercept") +
prior(normal(0,1), class = "sd") +
prior(normal(0,1), class = "sds")
brm_2_priors <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
prior = priors,
file = "data/brms/wordsum/sb/brm_2_priors",
data = sb_data) %>%
add_criterion("loo")
brm_2_priors
## Warning: There were 6 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.74 0.45 0.06 1.76 1.00 1948 1482
## sds(sigma_sage_1) 0.60 0.55 0.02 2.03 1.01 1173 1825
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.54 0.36 0.13 1.47 1.00 1305 1985
## sd(sigma_Intercept) 0.09 0.09 0.00 0.32 1.00 1747 1810
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.62 0.45 0.16 1.91 1.01 521 285
## sd(sigma_Intercept) 0.07 0.09 0.00 0.25 1.00 2387 2474
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.18 0.12 0.01 0.45 1.00 1462 2045
## sd(sigma_Intercept) 0.06 0.05 0.00 0.18 1.00 1745 1995
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 2.60 0.77 0.60 3.95 1.01 432 158
## sd(sigma_Intercept) 0.52 0.53 0.10 1.97 1.00 1375 1562
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.41 0.38 0.02 1.34 1.00 803 1187
## sd(sigma_Intercept) 0.12 0.12 0.00 0.44 1.00 1866 2286
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.42 0.46 0.03 1.86 1.00 708 808
## sd(sigma_Intercept) 0.13 0.16 0.00 0.53 1.00 1600 2061
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.11 0.09 0.00 0.34 1.00 2158 1810
## sd(sigma_Intercept) 0.08 0.06 0.00 0.22 1.00 1662 2029
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.10 0.00 0.38 1.00 1572 1581
## sd(sigma_Intercept) 0.08 0.06 0.00 0.24 1.00 1479 1874
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 2.18 1.05 0.14 4.21 1.00 2267 2162
## sigma_Intercept 0.47 0.41 -0.39 1.29 1.00 1289 1186
## hispanTRUE -0.28 0.21 -0.71 0.14 1.00 6065 3060
## femaleTRUE -0.10 0.16 -0.41 0.21 1.00 4406 3269
## sigma_hispanTRUE 0.03 0.11 -0.18 0.25 1.00 5592 2978
## sigma_femaleTRUE -0.11 0.08 -0.26 0.04 1.00 5284 2961
## sage_1 0.45 0.96 -1.56 2.18 1.00 2705 2881
## sigma_sage_1 -0.16 1.50 -3.12 3.31 1.00 2491 1962
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_2_priors)
loo_compare(brm_2, brm_2_priors)
## elpd_diff se_diff
## brm_2 0.0 0.0
## brm_2_priors -0.3 0.7
Divergent transitions reduced and some effect estimates changed, but no change in prediction accuracy.
brm_3 <- brm(bf(wordsum | cens(censoring) ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
file = "data/brms/wordsum/sb/brm_3",
data = sb_data) %>%
add_criterion("loo")
brm_3
## Warning: There were 38 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum | cens(censoring) ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 1.04 0.91 0.04 3.34 1.00 2095 2201
## sds(sigma_sage_1) 0.86 0.72 0.03 2.68 1.00 1139 1981
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.61 0.39 0.16 1.59 1.00 1644 1976
## sd(sigma_Intercept) 0.09 0.11 0.00 0.36 1.00 2054 2337
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.64 0.43 0.20 1.65 1.00 1419 2295
## sd(sigma_Intercept) 0.10 0.11 0.00 0.38 1.00 1639 2133
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.21 0.14 0.01 0.55 1.00 1101 1947
## sd(sigma_Intercept) 0.09 0.06 0.00 0.23 1.00 1451 1824
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.71 0.68 0.09 2.62 1.00 1518 1568
## sd(sigma_Intercept) 0.55 0.51 0.11 2.02 1.00 1913 2206
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.50 0.39 0.03 1.46 1.00 1277 1623
## sd(sigma_Intercept) 0.17 0.15 0.01 0.55 1.00 1276 1705
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.30 0.32 0.01 1.17 1.00 1377 1798
## sd(sigma_Intercept) 0.14 0.18 0.00 0.63 1.00 1650 1880
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.12 0.10 0.00 0.37 1.00 2083 2200
## sd(sigma_Intercept) 0.16 0.08 0.02 0.35 1.00 1281 952
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.15 0.12 0.01 0.44 1.00 1810 2395
## sd(sigma_Intercept) 0.18 0.10 0.03 0.42 1.00 1137 1310
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.48 0.74 5.96 8.95 1.00 2118 2214
## sigma_Intercept 0.60 0.42 -0.26 1.51 1.00 1770 1517
## hispanTRUE -0.25 0.23 -0.69 0.20 1.00 5296 2988
## femaleTRUE -0.16 0.17 -0.50 0.17 1.00 4249 2906
## sigma_hispanTRUE 0.01 0.12 -0.21 0.24 1.00 5020 2543
## sigma_femaleTRUE -0.17 0.08 -0.32 -0.01 1.00 5658 3332
## sage_1 0.80 2.95 -6.59 5.29 1.00 2005 2327
## sigma_sage_1 0.31 1.95 -3.63 4.63 1.00 2139 1932
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_3)
## Warning: Censored responses are not shown in 'pp_check'.
loo_compare(brm_1, brm_2, brm_3)
## Warning: Not all models have the same y variable. ('yhash' attributes do not match)
## elpd_diff se_diff
## brm_2 0.0 0.0
## brm_3 -0.3 4.2
## brm_1 -12.8 7.4
The censoring does not improve model fit. This model results in slightly larger sds but they’re also associated with larger errors.
brm_4 <- brm(bf(wordsum_ordinal ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
disc ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
family = cumulative(),
file = "data/brms/wordsum/sb/brm_4",
data = sb_data) %>%
add_criterion("loo")
brm_4
## Warning: There were 3 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: cumulative
## Links: mu = logit; disc = log
## Formula: wordsum_ordinal ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## disc ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 1.51 1.46 0.04 5.26 1.00 2077 2214
## sds(disc_sage_1) 0.78 0.67 0.03 2.53 1.00 1785 2559
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.14 0.77 0.27 3.07 1.00 1594 1964
## sd(disc_Intercept) 0.10 0.11 0.00 0.38 1.00 2324 2503
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.24 0.85 0.33 3.44 1.00 1584 2237
## sd(disc_Intercept) 0.12 0.14 0.00 0.44 1.00 1742 2092
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.44 0.31 0.03 1.20 1.00 1217 1926
## sd(disc_Intercept) 0.09 0.07 0.00 0.25 1.01 1649 2407
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.35 1.31 0.13 4.86 1.00 1783 1809
## sd(disc_Intercept) 0.60 0.57 0.12 2.18 1.00 2298 2636
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.86 0.76 0.05 2.73 1.00 1269 1736
## sd(disc_Intercept) 0.18 0.17 0.01 0.61 1.00 1648 2201
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.52 0.60 0.02 2.10 1.00 1467 2016
## sd(disc_Intercept) 0.16 0.19 0.00 0.66 1.00 1831 2069
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.24 0.21 0.01 0.78 1.00 1974 2386
## sd(disc_Intercept) 0.16 0.09 0.01 0.36 1.00 1242 1967
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.26 0.23 0.01 0.85 1.00 1610 2369
## sd(disc_Intercept) 0.22 0.12 0.03 0.50 1.00 1257 1588
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept[1] -12.49 4.52 -23.24 -5.92 1.00 1283 1338
## Intercept[2] -9.88 3.36 -18.14 -4.73 1.00 1245 1692
## Intercept[3] -7.74 2.67 -14.12 -3.60 1.00 1217 1474
## Intercept[4] -6.58 2.35 -12.07 -2.79 1.00 1268 1603
## Intercept[5] -4.39 1.87 -8.65 -1.17 1.00 1451 1636
## Intercept[6] -1.89 1.45 -4.85 0.96 1.00 2015 2100
## Intercept[7] 0.05 1.32 -2.23 2.95 1.00 3029 2936
## Intercept[8] 1.54 1.35 -0.62 4.62 1.00 3408 3573
## Intercept[9] 3.42 1.54 1.02 7.00 1.00 2388 3286
## Intercept[10] 6.15 2.03 2.93 10.83 1.00 1616 2540
## disc_Intercept -0.56 0.46 -1.44 0.37 1.00 2083 2536
## hispanTRUE -0.46 0.45 -1.42 0.37 1.00 4027 2468
## femaleTRUE -0.28 0.33 -1.00 0.35 1.00 3558 2383
## disc_hispanTRUE -0.01 0.14 -0.28 0.25 1.00 7278 2903
## disc_femaleTRUE 0.19 0.10 -0.00 0.38 1.00 7570 2777
## sage_1 2.05 5.44 -12.13 10.34 1.00 1895 1792
## disc_sage_1 -0.01 1.91 -4.53 3.47 1.00 3143 2227
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_4)
loo_compare(brm_1, brm_2, brm_3, brm_4)
## Warning: Not all models have the same y variable. ('yhash' attributes do not match)
## elpd_diff se_diff
## brm_4 0.0 0.0
## brm_2 -19.0 5.8
## brm_3 -19.3 3.9
## brm_1 -31.8 8.8
The ordinal model fits the data better and has only 3 divergent transitions, probably because it accounts for the discrete and bounded nature of the outcome, but we stick with normal because it does not matter for the final RPP results (see below).
brm_5 <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
shape ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
family = negbinomial(),
file = "data/brms/wordsum/sb/brm_5",
data = sb_data) %>%
add_criterion("loo")
brm_5
## Warning: Parts of the model have not converged (some Rhats are > 1.05). Be careful when analysing the results! We
## recommend running more iterations and/or setting stronger priors.
## Family: negbinomial
## Links: mu = log; shape = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## shape ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.08 0.07 0.01 0.21 3.33 4 11
## sds(shape_sage_1) 2.78 2.22 0.25 6.08 3.29 4 16
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.05 0.03 0.01 0.10 2.77 5 39
## sd(shape_Intercept) 2.19 0.56 1.51 3.06 3.19 5 13
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.06 0.01 0.17 3.32 4 11
## sd(shape_Intercept) 1.72 0.98 0.27 3.20 2.88 5 12
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.01 0.00 0.04 3.35 4 11
## sd(shape_Intercept) 0.67 0.50 0.17 1.49 3.21 4 12
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.08 0.02 0.04 0.12 3.25 4 16
## sd(shape_Intercept) 0.83 0.52 0.12 1.57 2.70 5 17
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.01 0.02 0.05 3.17 5 11
## sd(shape_Intercept) 1.82 1.03 0.55 3.46 2.92 5 19
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.03 0.02 0.00 0.06 2.93 5 29
## sd(shape_Intercept) 2.42 0.91 1.17 3.77 3.12 5 15
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.01 0.00 0.05 3.11 5 13
## sd(shape_Intercept) 1.56 1.39 0.22 3.95 4.22 4 12
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.02 0.01 0.05 2.55 5 18
## sd(shape_Intercept) 1.39 1.16 0.19 3.42 4.09 4 11
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 2.02 0.05 1.95 2.11 3.24 4 11
## shape_Intercept 16.80 5.18 9.15 22.52 3.35 4 16
## hispanTRUE -0.13 0.06 -0.24 -0.06 3.08 5 12
## femaleTRUE -0.03 0.03 -0.11 0.00 3.26 4 11
## shape_hispanTRUE -1.57 3.64 -6.27 4.68 3.17 4 23
## shape_femaleTRUE -0.57 4.53 -5.19 7.55 3.09 5 18
## sage_1 0.43 0.24 -0.09 0.65 2.71 5 21
## shape_sage_1 2.87 8.74 -10.12 16.35 2.75 5 15
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Does not converge.
brm_5_v2 <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)),
seed = 14,
chains = 4,
family = negbinomial(),
file = "data/brms/wordsum/sb/brm_5_v2",
data = sb_data) %>%
add_criterion("loo")
brm_5_v2
## Warning: Parts of the model have not converged (some Rhats are > 1.05). Be careful when analysing the results! We
## recommend running more iterations and/or setting stronger priors.
## Warning: There were 110 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: negbinomial
## Links: mu = log; shape = identity
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.22 0.05 0.17 0.30 Inf 4 NA
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.05 0.00 0.14 Inf 4 NA
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.07 0.05 0.03 0.15 Inf 4 NA
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.01 0.00 0.03 Inf 4 NA
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.03 0.04 0.11 Inf 4 NA
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.01 0.01 0.00 0.02 Inf 4 NA
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.08 0.10 0.00 0.26 Inf 4 NA
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.02 0.01 0.04 Inf 4 NA
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.03 0.02 0.02 0.06 Inf 4 NA
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 1.99 0.11 1.83 2.12 Inf 4 NA
## hispanTRUE -0.04 0.04 -0.09 0.00 Inf 4 NA
## femaleTRUE -0.00 0.04 -0.05 0.04 Inf 4 NA
## sage_1 -0.27 0.44 -0.81 0.41 Inf 4 NA
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 2927981250000.00 3338800168506.79 195235000000.00 8643200000000.00 Inf 4 NA
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
brm_5_more_iters <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
shape ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
iter = 5000,
family = negbinomial(),
file = "data/brms/wordsum/sb/brm_5_more_iters",
data = sb_data) %>%
add_criterion("loo")
brm_5_more_iters
## Warning: Parts of the model have not converged (some Rhats are > 1.05). Be careful when analysing the results! We
## recommend running more iterations and/or setting stronger priors.
## Family: negbinomial
## Links: mu = log; shape = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## shape ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 5000; warmup = 2500; thin = 1;
## total post-warmup draws = 10000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.12 0.11 0.01 0.32 2.59 5 11
## sds(shape_sage_1) 3.06 2.26 0.25 7.20 2.44 5 14
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.03 0.02 0.00 0.06 3.57 4 18
## sd(shape_Intercept) 1.70 0.93 0.07 3.05 2.42 5 14
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.06 0.01 0.19 3.56 4 14
## sd(shape_Intercept) 1.43 0.85 0.28 3.06 4.14 4 13
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.01 0.00 0.03 2.81 5 26
## sd(shape_Intercept) 0.82 0.90 0.08 3.63 2.61 5 11
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.10 0.07 0.04 0.24 4.05 4 11
## sd(shape_Intercept) 1.25 0.85 0.11 3.28 2.39 5 11
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.03 0.01 0.02 0.05 2.65 5 11
## sd(shape_Intercept) 1.62 1.13 0.42 3.53 3.08 5 19
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.04 0.02 0.00 0.07 2.91 5 11
## sd(shape_Intercept) 2.55 1.93 0.38 6.84 3.73 4 11
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.02 0.00 0.05 3.67 4 11
## sd(shape_Intercept) 1.14 1.20 0.20 3.32 2.45 5 12
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.03 0.01 0.01 0.05 3.09 5 11
## sd(shape_Intercept) 1.57 1.34 0.17 3.38 2.94 5 11
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 2.01 0.07 1.91 2.10 2.59 5 12
## shape_Intercept 16.14 4.50 9.17 21.75 2.92 5 11
## hispanTRUE -0.05 0.05 -0.14 0.01 2.03 5 13
## femaleTRUE -0.00 0.03 -0.07 0.05 2.59 5 19
## shape_hispanTRUE -0.99 4.74 -6.48 7.03 2.91 5 41
## shape_femaleTRUE 2.58 5.22 -4.50 10.04 2.72 5 15
## sage_1 0.45 0.28 -0.12 0.80 3.44 4 11
## shape_sage_1 1.06 6.86 -10.47 10.38 2.88 5 13
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Failed to converge even with more iterations.
brm_6 <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)),
seed = 14,
chains = 4,
iter = 2000,
family = poisson(),
file = "data/brms/wordsum/sb/brm_6",
data = sb_data) %>%
add_criterion("loo")
brm_6
## Warning: There were 77 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: poisson
## Links: mu = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.22 0.21 0.01 0.83 1.00 907 825
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.06 0.00 0.21 1.00 1169 1487
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.05 0.06 0.00 0.22 1.00 999 1648
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.02 0.00 0.07 1.00 1485 1912
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.15 0.19 0.01 0.79 1.00 532 259
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.04 0.04 0.00 0.16 1.00 830 1333
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.05 0.06 0.00 0.22 1.01 1329 2293
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.02 0.00 0.07 1.00 1127 720
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.02 0.02 0.00 0.08 1.00 1944 1915
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 2.00 0.13 1.71 2.27 1.01 666 263
## hispanTRUE -0.04 0.05 -0.14 0.07 1.00 864 334
## femaleTRUE -0.01 0.03 -0.08 0.06 1.00 4606 2820
## sage_1 0.16 0.58 -1.25 1.24 1.00 1100 712
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
loo_compare(brm_2, brm_4, brm_6)
## Warning: Not all models have the same y variable. ('yhash' attributes do not match)
## elpd_diff se_diff
## brm_4 0.0 0.0
## brm_2 -19.0 5.8
## brm_6 -124.8 10.9
Bad fit compared to normal and ordinal models.
brm_7 <- brm(bf(wordsum | trials(10) ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)),
family = binomial(),
seed = 14,
chains = 4,
iter = 2000,
file = "data/brms/wordsum/sb/brm_7",
data = sb_data) %>%
add_criterion("loo")
pp_check(brm_7)
brm_7
## Warning: There were 41 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: binomial
## Links: mu = logit
## Formula: wordsum | trials(10) ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.95 0.81 0.05 3.14 1.00 1122 2289
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.31 0.22 0.08 0.85 1.00 1409 1929
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.29 0.21 0.06 0.78 1.00 1189 1572
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.07 0.02 0.29 1.00 1145 1243
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.51 0.50 0.09 2.01 1.00 1138 1214
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.15 0.14 0.01 0.51 1.00 1182 1294
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.20 0.22 0.02 0.83 1.00 1057 1493
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.07 0.06 0.00 0.21 1.00 1690 2341
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.11 0.07 0.01 0.28 1.01 1261 1762
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 1.04 0.46 0.08 1.96 1.00 1511 1599
## hispanTRUE -0.12 0.10 -0.32 0.07 1.00 5931 3012
## femaleTRUE -0.04 0.07 -0.19 0.10 1.00 7558 3102
## sage_1 -0.46 2.15 -5.85 2.67 1.00 1539 1368
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
loo_compare(brm_2, brm_4, brm_7)
## Warning: Not all models have the same y variable. ('yhash' attributes do not match)
## elpd_diff se_diff
## brm_4 0.0 0.0
## brm_2 -19.0 5.8
## brm_7 -19.9 10.2
Binomial doesn’t seem to be too bad a fit.
library(tidyverse)
library(brms)
options(mc.cores = 20,
threads = threading(5, static = TRUE),
brms.backend = "cmdstanr")
sb_data <- readRDS("data/preprocessed/wordsum/sb_data.rds")
rstan::rstan_options(auto_write = TRUE)
sb_data <- sb_data %>% mutate(wordsum_prop = wordsum / 11) %>% filter(wordsum_prop != 0) # exclude one person with a score of 0
brm_8 <- brm(bf(wordsum_prop ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
phi ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)),
family = Beta(),
seed = 14,
chains = 4,
iter = 2000,
file = "data/brms/wordsum/sb/brm_8",
data = sb_data) %>%
add_criterion("loo")
brm_8
## Warning: There were 44 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: beta
## Links: mu = logit; phi = log
## Formula: wordsum_prop ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## phi ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 477)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 0.57 0.55 0.02 2.01 1.00 1387 2166
## sds(phi_sage_1) 0.92 0.83 0.05 3.19 1.00 1788 2577
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.25 0.18 0.05 0.73 1.00 1224 818
## sd(phi_Intercept) 0.17 0.18 0.01 0.61 1.00 2017 2063
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.23 0.16 0.06 0.64 1.00 1523 2130
## sd(phi_Intercept) 0.20 0.21 0.01 0.71 1.00 1443 2017
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.08 0.05 0.01 0.21 1.00 1053 1709
## sd(phi_Intercept) 0.12 0.09 0.01 0.34 1.00 1867 2221
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.36 0.38 0.05 1.48 1.00 1679 2437
## sd(phi_Intercept) 0.81 0.75 0.16 2.93 1.00 1883 2180
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.19 0.15 0.01 0.56 1.00 1163 1323
## sd(phi_Intercept) 0.23 0.22 0.01 0.82 1.00 1572 2318
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.15 0.00 0.52 1.00 1454 1838
## sd(phi_Intercept) 0.24 0.29 0.01 1.01 1.00 1727 2568
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.05 0.04 0.00 0.15 1.00 1846 2331
## sd(phi_Intercept) 0.17 0.12 0.01 0.45 1.00 1462 1987
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.05 0.00 0.17 1.00 1610 2264
## sd(phi_Intercept) 0.26 0.15 0.03 0.62 1.00 1209 1450
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 0.73 0.35 -0.03 1.40 1.00 1779 1771
## phi_Intercept 2.13 0.65 0.61 3.32 1.00 1850 1830
## hispanTRUE -0.08 0.08 -0.24 0.09 1.00 5119 2858
## femaleTRUE -0.06 0.06 -0.18 0.07 1.00 5449 3078
## phi_hispanTRUE 0.14 0.20 -0.26 0.51 1.00 5334 2730
## phi_femaleTRUE 0.20 0.15 -0.10 0.49 1.00 4912 2298
## sage_1 0.09 1.49 -3.74 2.30 1.00 1795 1432
## phi_sage_1 0.63 2.45 -4.77 6.14 1.00 2278 1869
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
pp_check(brm_8)
Can’t loo_compare because we’re missing one data point, but pp_check doesn’t look too bad.
# brm_9 <- brm(bf(wordsum | trials(10) ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)),
# # phi ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets))
# family = beta_binomial(),
# seed = 14,
# chains = 4,
# iter = 2000,
# file = "data/brms/wordsum/sb/brm_9",
# data = sb_data) %>%
# add_criterion("loo")
#
# pp_check(brm_9)
# brm_9
# loo_compare(brm_2, brm_4, brm_9)
Took too long and kept crashing.
brm_2
## Warning: There were 47 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data (Number of observations: 478)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 1.22 1.01 0.05 3.71 1.00 1322 2045
## sds(sigma_sage_1) 0.63 0.58 0.02 2.13 1.00 1306 1634
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.56 0.37 0.15 1.52 1.00 1497 1292
## sd(sigma_Intercept) 0.09 0.09 0.00 0.32 1.00 1985 2310
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.55 0.35 0.14 1.49 1.00 1449 1507
## sd(sigma_Intercept) 0.06 0.08 0.00 0.25 1.00 2382 2210
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.18 0.13 0.01 0.48 1.00 1244 1641
## sd(sigma_Intercept) 0.06 0.05 0.00 0.19 1.00 1873 1847
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.74 0.69 0.12 2.67 1.00 1621 1993
## sd(sigma_Intercept) 0.51 0.49 0.10 1.90 1.00 1294 1234
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.39 0.33 0.01 1.23 1.01 858 1419
## sd(sigma_Intercept) 0.12 0.12 0.00 0.42 1.00 1583 1687
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.35 0.35 0.02 1.27 1.00 1157 1486
## sd(sigma_Intercept) 0.13 0.16 0.00 0.56 1.00 1164 1778
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.12 0.10 0.00 0.36 1.00 1893 1829
## sd(sigma_Intercept) 0.08 0.06 0.00 0.23 1.00 1320 1922
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.13 0.10 0.00 0.39 1.00 1844 2167
## sd(sigma_Intercept) 0.08 0.06 0.00 0.24 1.00 1508 1894
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.41 0.72 5.87 8.81 1.00 1810 2104
## sigma_Intercept 0.50 0.39 -0.29 1.39 1.00 1174 957
## hispanTRUE -0.29 0.21 -0.73 0.13 1.00 4984 2539
## femaleTRUE -0.11 0.16 -0.43 0.21 1.00 5674 2963
## sigma_hispanTRUE 0.03 0.11 -0.18 0.24 1.00 5495 2900
## sigma_femaleTRUE -0.11 0.08 -0.26 0.03 1.00 5576 2875
## sage_1 0.03 3.37 -8.40 4.94 1.00 1676 1315
## sigma_sage_1 -0.08 1.55 -3.04 3.58 1.00 2166 1738
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
We can conclude that the 4 chains converged since the
Rhat
s are consistently \(<=1.01\). We see that the sd
Estimate
s of the random intercepts for \(\sigma\) are pretty small so we don’t
expect large variations in the residual variance depending on the
predictors. Larger sds can be observed for predicting \mu
(i.e., the actual Wordsum Scores). Males scored on average 0.11 higher
on Wordsum than females.
pp_check(brm_2)
t <- sb_data %>% mutate(age_group = if_else(age > 40, "> 40", "=< 40"))
yrep <- posterior_predict(brm_2, newdata = t, ndraws = 1000)
p <- ppc_pit_ecdf_grouped(t$wordsum, yrep, group = as.factor(t$age_group), prob = 0.99, plot_diff = F)
p
Some deviations at the right tail but not too bad.
loo_R2(brm_2)
## Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
## Estimate Est.Error Q2.5 Q97.5
## R2 0.06907 0.02881 0.01182 0.1243
The included predictors do not explain a lot of variance (~7%), so not a lot of adjustment can happen even if there are differences between the sample and the population.
This is the P part of RPP.
set.seed(810)
sim_pop_sample <- census_sb %>%
sample_n(size = 100000,
weight = census_n,
replace = TRUE)
Now that we have a post-stratified sample, we can use
tidybayes’s add_predicted_draws()
function to draw
1000 samples (i.e., Wordsum Scores) from the posterior for each one of
the 100000 simulated participants in sim_pop_sample
:
sim_pop_sample_with_draws <- sim_pop_sample %>%
add_predicted_draws(brm_2,
ndraws = 1000,
seed = 810,
allow_new_levels = TRUE) %>%
mutate(.prediction = round(pmax(0, pmin(10, .prediction))))
data.table::fwrite(sim_pop_sample_with_draws, "data/results/wordsum/sim_pop_sample_with_draws.csv.gz")
This yields a dataframe in long format with 100000 (number of
simulated participants) * 1000 (number of draws from the posterior) =
100 million rows. allow_new_levels = TRUE
is necessary for
estimating the outcome for combinations of the 11 predictors which do
not occur in the actual TL sample. This is inevitable in detailed enough
poststratification tables and was the case here as the
poststratification table we used had over 3 million subcategories/rows.
The mutate()
call serves to set all predicted CFT 20-R 20-R
score scores which go below or above the scale limits (0 and 10,
respectively), to those scale limits.
Here we aggregate the intelligence scores by age and compute the RPP corrected means and SDs (+ their SEs) for each age group:
sim_pop_sample_with_draws <- data.table::fread("data/results/wordsum/sim_pop_sample_with_draws.csv.gz")
means_sds_and_ses_RPP <- sim_pop_sample_with_draws %>%
group_by(age, .draw) %>%
summarise(mean_prediction = mean(.prediction),
sd_prediction = sd(.prediction)) %>%
group_by(age) %>%
summarise(RPP_mean = mean(mean_prediction),
RPP_se_of_mean = sd(mean_prediction),
RPP_sd = sqrt(mean(sd_prediction^2)),
RPP_se_of_sd = sd(sd_prediction))
means_sds_and_ses_RPP %>% head(14) %>% kable(digits = 2) %>% kable_styling(full_width = FALSE)
age | RPP_mean | RPP_se_of_mean | RPP_sd | RPP_se_of_sd |
---|---|---|---|---|
21 | 6.62 | 0.21 | 1.66 | 0.15 |
22 | 6.66 | 0.20 | 1.68 | 0.14 |
23 | 6.70 | 0.18 | 1.70 | 0.13 |
24 | 6.74 | 0.17 | 1.71 | 0.12 |
25 | 6.76 | 0.16 | 1.73 | 0.11 |
26 | 6.76 | 0.16 | 1.76 | 0.11 |
27 | 6.75 | 0.15 | 1.78 | 0.11 |
28 | 6.78 | 0.15 | 1.78 | 0.10 |
29 | 6.77 | 0.15 | 1.82 | 0.11 |
30 | 6.79 | 0.14 | 1.83 | 0.11 |
31 | 6.79 | 0.14 | 1.84 | 0.10 |
32 | 6.81 | 0.14 | 1.85 | 0.10 |
33 | 6.80 | 0.15 | 1.85 | 0.10 |
34 | 6.83 | 0.14 | 1.84 | 0.10 |
This code chunk first aggregates by age and posterior draw (1000) to compute the mean and SD of Wordsum estimates across the 100000 simulated participants. Then it computes the means and SDs across the draws (which form the RPP means and SDs for each age group) and the SDs of the means and SDs calculated in the previous step, hence yielding the Bayesian SEs of the RPP means and SDs, respectively.
Here we calculate linearly transformed IQs for all cft scores that occur in the poststratified sample.
iq <- function(cft_score, mean, sd) {
iq_score <- ((cft_score - mean) / sd) * 15 + 100
return(iq_score)
}
t <- function(raw_score, mean, sd) {
t_score_value <- ((raw_score - mean) / sd) * 10 + 50
return(t_score_value)
}
iqs_linear <- sim_pop_sample_with_draws %>%
left_join(select(means_sds_and_ses_RPP, c(RPP_mean, RPP_sd, age)), by = "age") %>%
group_by(age) %>%
mutate(wordsum_score = .prediction,
RPP_IQ = iq(wordsum_score, RPP_mean, RPP_sd),
RPP_T = t(wordsum_score, RPP_mean, RPP_sd)) %>%
group_by(age, wordsum_score) %>%
summarise(RPP_IQ = round(mean(iq_score)),
RPP_T = round(mean(T_score)))
## Error in `summarise()`:
## ℹ In argument: `RPP_IQ = round(mean(iq_score))`.
## ℹ In group 1: `age = 21` and `wordsum_score = 0`.
## Caused by error:
## ! object 'iq_score' not found
iqs_linear %>% head(14) %>% kable %>% kable_styling(full_width = FALSE)
## Error in eval(expr, envir, enclos): object 'iqs_linear' not found
Due to the left skewed distributions linear transformation probably underestimates IQs at the lower tail.
iqs_normalised <- sim_pop_sample_with_draws %>%
mutate(wordsum_score = as.numeric(as.character(.prediction))) %>%
group_by(age, .draw) %>%
mutate(n = n(),
normal_transformed_score = qnorm((rank(wordsum_score) - 0.5) / n)) %>%
mutate(iq_score = normal_transformed_score * 15 + 100,
T_score = normal_transformed_score * 10 + 50) %>%
group_by(age, wordsum_score) %>%
summarise(RPP_IQ = round(mean(iq_score)),
RPP_T = round(mean(T_score)))
iqs_normalised %>% head(14) %>% kable %>% kable_styling(full_width = FALSE)
age | wordsum_score | RPP_IQ | RPP_T |
---|---|---|---|
21 | 0 | 53 | 19 |
21 | 1 | 58 | 22 |
21 | 2 | 64 | 26 |
21 | 3 | 70 | 30 |
21 | 4 | 78 | 35 |
21 | 5 | 86 | 40 |
21 | 6 | 94 | 46 |
21 | 7 | 103 | 52 |
21 | 8 | 112 | 58 |
21 | 9 | 120 | 64 |
21 | 10 | 130 | 70 |
22 | 0 | 54 | 19 |
22 | 1 | 59 | 22 |
22 | 2 | 64 | 26 |
Like above but without converting the cumulative distribution function values to standard Gaussian quantiles and then to IQs/Ts.
percentiles <- sim_pop_sample_with_draws %>%
mutate(wordsum_score = .prediction) %>%
group_by(age, .draw) %>%
mutate(n = n(),
percentile = (rank(wordsum_score) - 0.5) / n) %>%
group_by(age, wordsum_score) %>%
summarise(RPP_percentile = mean(percentile))
percentiles %>% head(14) %>% kable %>% kable_styling(full_width = FALSE)
age | wordsum_score | RPP_percentile |
---|---|---|
21 | 0 | 0.0012 |
21 | 1 | 0.0035 |
21 | 2 | 0.0096 |
21 | 3 | 0.0265 |
21 | 4 | 0.0712 |
21 | 5 | 0.1728 |
21 | 6 | 0.3528 |
21 | 7 | 0.5822 |
21 | 8 | 0.7847 |
21 | 9 | 0.9104 |
21 | 10 | 0.9747 |
22 | 0 | 0.0013 |
22 | 1 | 0.0038 |
22 | 2 | 0.0099 |
Here we use the home-made age_norm_comparisons
function
to do some robustness checks and other analyses.
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))) # for handling normal predictons (that go out of wordsum bounds)
)
sb_raw_vs_rpp_rp <- age_norm_comparisons(
brm_2,
ps_table = census_sb,
RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
sim_size = 100000,
prediction_transform = prediction_transform,
labels = c(labels = c("Raw", "RPP", "RP")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
# "#20854EFF",
# "#7876B1FF",
# "#6F99ADFF",
"#E18727FF"
# "#FFDC91FF",
# "#EE4C97FF"
),
output_file = "data/results/wordsum/sb_raw_vs_rpp_rp.rds"
)
sb_raw_vs_rpp_rp[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.01 0.113 1.75 0.0726 RPP_brm_2
## 3 7.25 0.0733 1.70 0.0514 RP_brm_2
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
As can be expected given the small associations between the adjustment variables and the outcome, most of the adjustment to the age group means were the result of the regularisation rather than the poststratification. Nevertheless, poststratification led to a downward adjustment of ~.3 points, indicating that Wordsum scores in this Prolific sample are higher than in the broader US population. The percentile plot also indicates that the poststratification introduced additional consistency in the estimates across age groups.
set.seed(810)
sim_norming_sample <- sb_data %>%
group_by(age, female, educ, income_brackets, race, hispan, region_residence, region_birth, degfield_branch, marst, occ_category) %>%
summarise(sample_n = n()) %>%
ungroup() %>%
sample_n(size = 100000,
weight = sample_n,
replace = TRUE)
sim_norming_sample_with_draws <- sim_norming_sample %>%
add_predicted_draws(brm_2,
ndraws = 1000,
seed = 810,
allow_new_levels = TRUE) %>%
mutate(.prediction = round(pmax(0, pmin(10, .prediction))))
RP_iqs_normalised <- sim_norming_sample_with_draws %>%
group_by(age, .draw) %>%
mutate(n = n(),
normal_transformed_score = qnorm((rank(.prediction) - 0.5) / n)) %>%
mutate(iq_score = normal_transformed_score * 15 + 100,
T_score = normal_transformed_score * 10 + 50) %>%
group_by(age, wordsum_score = .prediction) %>%
summarise(RP_IQ = round(mean(iq_score)),
RP_T = round(mean(T_score)))
raw_iqs_normalised <- sb_data %>%
group_by(age) %>%
mutate(n = n(),
normal_transformed_score = qnorm((rank(wordsum) - 0.5) / n)) %>%
mutate(iq_score = normal_transformed_score * 15 + 100,
T_score = normal_transformed_score * 10 + 50) %>%
group_by(age, wordsum_score = wordsum) %>%
summarise(Raw_IQ = round(mean(iq_score)),
Raw_T = round(mean(T_score)))
iqs <- iqs_normalised %>%
left_join(RP_iqs_normalised, by = c("age", "wordsum_score")) %>%
left_join(raw_iqs_normalised, by = c("age", "wordsum_score"))
mean(iqs$RPP_IQ-iqs$RP_IQ)
## [1] 2.029
mean(abs(iqs$RPP_IQ-iqs$RP_IQ))
## [1] 2.389
max(iqs$RPP_IQ-iqs$RP_IQ)
## [1] 11
mean(iqs$RPP_IQ-iqs$Raw_IQ, na.rm = T)
## [1] 1.736
mean(abs(iqs$RPP_IQ-iqs$Raw_IQ), na.rm = T)
## [1] 4.877
max(iqs$RPP_IQ-iqs$Raw_IQ, na.rm = T)
## [1] 20
RPP IQs were on average 1.6 IQ points larger than raw ones (absolute difference 4.9, max difference 20). Again illustrating that poststratification made an incremental difference beyond the regularisation, RPP IQs were almost 2 points larger than RP ones (max = 12).
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))), # for handling normal predictons
function(x) as.numeric(as.character(x)) # for handling ordinal predictions
)
sb_ord_nor <- age_norm_comparisons(
brm_2, brm_4,
ps_table = census_sb,
# RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
prediction_transform = prediction_transform,
sim_size = 100000,
labels = c(labels = c("Raw", "RPP, normal", "RPP, ordinal")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
"#20854EFF"
# "#7876B1FF",
# "#6F99ADFF"
# "#E18727FF"
# "#FFDC91FF",
# "#EE4C97FF"
),
output_file = "data/results/wordsum/sb_ord_nor.rds"
)
sb_ord_nor[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.01 0.113 1.75 0.0721 RPP_brm_2
## 3 6.98 0.126 1.83 0.0960 RPP_brm_4
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))), # for handling normal predictons
function(x) x # leave binomial predicitons as is
)
sb_nor_bin <- age_norm_comparisons(
brm_2, brm_7,
ps_table = census_sb,
# RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
prediction_transform = prediction_transform,
sim_size = 100000,
labels = c(labels = c("Raw", "RPP, normal", "RPP, binomial")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
"#20854EFF"
# "#7876B1FF",
# "#6F99ADFF"
# "#E18727FF"
# "#FFDC91FF",
# "#EE4C97FF"
),
output_file = "data/results/wordsum/sb_nor_bin.rds"
)
sb_nor_bin[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.01 0.113 1.75 0.0722 RPP_brm_2
## 3 7.03 0.107 1.60 0.0446 RPP_brm_7
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
Barely any difference in means, although ordinal predictions are somewhat more dispersed.
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))), # for handling normal predictons
function(x) round(x*11) # for handling beta predictions
)
sb_nor_beta <- age_norm_comparisons(
brm_2, brm_8,
ps_table = census_sb,
# RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
prediction_transform = prediction_transform,
sim_size = 100000,
labels = c(labels = c("Raw", "RPP, normal", "RPP, beta")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
"#20854EFF"
# "#7876B1FF",
# "#6F99ADFF"
# "#E18727FF"
# "#FFDC91FF",
# "#EE4C97FF"
),
output_file = "data/results/wordsum/sb_nor_beta.rds"
)
sb_nor_beta[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.01 0.113 1.75 0.0721 RPP_brm_2
## 3 7.05 0.119 1.76 0.0798 RPP_brm_8
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))), # for handling normal predictons
function(x) round(pmax(0, pmin(10, x))) # for handling normal predictions
)
sb_default_vs_boosted <- age_norm_comparisons(
brm_2, brm_2_boosted,
ps_table = census_sb,
# RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
prediction_transform = prediction_transform,
sim_size = 100000,
labels = c(labels = c("Raw", "RPP, normal", "RPP, boosted normal")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
"#20854EFF"
# "#7876B1FF",
# "#6F99ADFF"
# "#E18727FF"
# "#FFDC91FF",
# "#EE4C97FF"
),
output_file = "data/results/wordsum/sb_default_vs_boosted.rds"
)
sb_default_vs_boosted[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.01 0.113 1.74 0.0722 RPP_brm_2
## 3 7.00 0.115 1.75 0.0720 RPP_brm_2_boosted
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))), # for handling normal predictons
function(x) round(pmax(0, pmin(10, x))) # for handling normal predictions
)
sb_with_vs_without_ints <- age_norm_comparisons(
brm_2, brm_1,
ps_table = census_sb,
# RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
prediction_transform = prediction_transform,
sim_size = 100000,
labels = c(labels = c("Raw", "RPP, without ints", "RPP, with ints")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
"#20854EFF"
# "#7876B1FF",
# "#6F99ADFF"
# "#E18727FF"
# "#FFDC91FF",
# "#EE4C97FF"
),
output_file = "data/results/wordsum/sb_with_vs_without_ints.rds"
)
sb_with_vs_without_ints[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.01 0.112 1.75 0.0720 RPP_brm_2
## 3 7.00 0.131 1.88 0.0835 RPP_brm_1
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
Same here.
library(careless)
main_qs <- c("AAID", "PANAS", "PAQ", "PSS", "NEPS", "ULS", "FCV", "DAQ", "CESD", "HEXACO", "OCIR", "PTQ", "RAAS", "KSA", "SAS", "MFQ", "CQ")
sb_data_CR_cleaned <- sb_data %>%
filter(if_all(starts_with(main_qs), ~ !is.na(.x))) %>%
mutate(psychsyn = psychsyn(select(., starts_with(main_qs))),
psychant = psychant(select(., starts_with(main_qs))),
CR_psychsyn_outlier = psychsyn < 0.22,
CR_psychant_outlier = psychant > -0.03,
CR_mahal_outlier = mahad(select(., starts_with(main_qs)), flag = TRUE, confidence = .999, plot = F)$flagged,
CR_not_serious = ZY02 == "No, my responses should not be used.",
CR_knows_wordsum = MS04 == "Yes") %>%
filter(!if_any(starts_with("CR_"), ~ .x == TRUE))
Dropped 85 potentially careless respondents.
brm_2_non_CR <- brm(bf(wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets),
sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
),
seed = 14,
chains = 4,
file = "data/brms/wordsum/sb/brm_4_non_CR",
data = sb_data_CR_cleaned) %>%
add_criterion("loo")
brm_2_non_CR
## Warning: There were 95 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: wordsum ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## sigma ~ s(age) + (1 | educ) + (1 | race) + hispan + (1 | region_residence) + (1 | marst) + (1 | degfield_branch) + (1 | occ_category) + (1 | region_birth) + female + (1 | income_brackets)
## Data: sb_data_CR_cleaned (Number of observations: 393)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sds(sage_1) 1.07 0.96 0.04 3.62 1.00 1323 1685
## sds(sigma_sage_1) 0.81 0.76 0.04 2.96 1.01 790 460
##
## Multilevel Hyperparameters:
## ~degfield_branch (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.61 0.38 0.19 1.66 1.00 1675 2303
## sd(sigma_Intercept) 0.12 0.13 0.00 0.45 1.00 1409 2223
##
## ~educ (Number of levels: 5)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.64 0.40 0.16 1.66 1.00 1139 1083
## sd(sigma_Intercept) 0.08 0.10 0.00 0.31 1.00 1529 1921
##
## ~income_brackets (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.18 0.13 0.00 0.49 1.00 831 600
## sd(sigma_Intercept) 0.06 0.05 0.00 0.17 1.00 1775 1875
##
## ~marst (Number of levels: 3)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.57 0.60 0.04 2.22 1.00 1493 2010
## sd(sigma_Intercept) 0.51 0.57 0.09 2.15 1.00 786 496
##
## ~occ_category (Number of levels: 6)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.48 0.33 0.05 1.33 1.01 1043 1485
## sd(sigma_Intercept) 0.18 0.16 0.01 0.58 1.00 1070 1734
##
## ~race (Number of levels: 4)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.26 0.29 0.01 1.04 1.00 1422 1841
## sd(sigma_Intercept) 0.15 0.18 0.01 0.66 1.00 1511 1824
##
## ~region_birth (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.10 0.08 0.00 0.31 1.00 1792 1964
## sd(sigma_Intercept) 0.07 0.06 0.00 0.21 1.00 1801 2227
##
## ~region_residence (Number of levels: 9)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.11 0.00 0.41 1.00 1429 1018
## sd(sigma_Intercept) 0.15 0.10 0.01 0.37 1.00 959 1222
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.44 0.70 6.00 8.86 1.00 1843 2065
## sigma_Intercept 0.38 0.38 -0.40 1.21 1.00 1309 1106
## hispanTRUE -0.10 0.23 -0.57 0.35 1.00 3969 3044
## femaleTRUE -0.08 0.17 -0.41 0.25 1.00 3815 2700
## sigma_hispanTRUE -0.07 0.13 -0.32 0.19 1.00 3261 2870
## sigma_femaleTRUE -0.02 0.09 -0.20 0.16 1.00 3721 2342
## sage_1 0.66 2.97 -7.10 5.17 1.00 1511 1292
## sigma_sage_1 0.13 1.98 -3.89 5.06 1.01 755 498
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
prediction_transform = list(
function(x) round(pmax(0, pmin(10, x))),
function(x) round(pmax(0, pmin(10, x)))
)
sb_vs_non_CR <- age_norm_comparisons(
brm_2, brm_2_non_CR,
ps_table = census_sb,
# RP = c("census", "norming_sample"),
ps_variables = c("age", "female", "educ", "income_brackets", "race", "hispan", "region_residence", "region_birth", "degfield_branch", "marst", "occ_category"),
prediction_transform = prediction_transform,
sim_size = 100000,
labels = c(labels = c("Raw", "RPP, full data", "RPP, CR cleaned data")),
palette = c(
"#BC3C29FF",
"#0072B5FF",
# "#20854EFF"
# "#7876B1FF",
# "#6F99ADFF"
# "#E18727FF"
# "#FFDC91FF",
"#EE4C97FF"
),
output_file = "data/results/wordsum/sb_vs_non_CR.rds"
)
sb_vs_non_CR[-1]
## $overall_estimates
## # A tibble: 3 × 5
## Mean SE_of_Mean SD SE_of_SD Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 7.30 NA 1.71 NA Raw
## 2 7.00 0.113 1.75 0.0722 RPP_brm_2
## 3 7.06 0.125 1.69 0.0882 RPP_brm_2_non_CR
##
## $means_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
##
## $SDs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_pointrange()`).
## Warning: Removed 46 rows containing missing values or values outside the scale range (`geom_segment()`).
##
## $SEs_plot
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
##
## $percentile_plot
Negligible differences despite dropping 85/478 datapoints.
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] careless_1.2.2 rstan_2.32.6 StanHeaders_2.32.10 bayesplot_1.11.1 marginaleffects_0.24.0
## [6] tidybayes_3.0.7 brms_2.22.0 Rcpp_1.0.12 kableExtra_1.4.0 ggrepel_0.9.6
## [11] data.table_1.16.4 haven_2.5.4 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [16] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [21] ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] mnormt_2.1.1 gridExtra_2.3 inline_0.3.20 sandwich_3.1-1 readxl_1.4.3
## [6] rlang_1.1.4 magrittr_2.0.3 multcomp_1.4-26 matrixStats_1.5.0 compiler_4.4.1
## [11] mgcv_1.9-1 loo_2.8.0 reshape2_1.4.4 systemfonts_1.1.0 vctrs_0.6.5
## [16] pkgconfig_2.0.3 arrayhelpers_1.1-0 crayon_1.5.3 fastmap_1.2.0 backports_1.5.0
## [21] labeling_0.4.3 utf8_1.2.4 cmdstanr_0.8.0 rmarkdown_2.27 tzdb_0.4.0
## [26] ps_1.7.6 xfun_0.45 cachem_1.1.0 jsonlite_1.8.8 highr_0.11
## [31] psych_2.4.12 parallel_4.4.1 R6_2.5.1 bslib_0.7.0 stringi_1.8.4
## [36] jquerylib_0.1.4 cellranger_1.1.0 estimability_1.5.1 knitr_1.47 zoo_1.8-12
## [41] R.utils_2.12.3 Matrix_1.7-1 splines_4.4.1 timechange_0.3.0 tidyselect_1.2.1
## [46] rstudioapi_0.16.0 abind_1.4-8 yaml_2.3.8 codetools_0.2-20 processx_3.8.4
## [51] pkgbuild_1.4.4 plyr_1.8.9 lattice_0.22-6 withr_3.0.0 bridgesampling_1.1-2
## [56] posterior_1.6.0 coda_0.19-4.1 evaluate_0.24.0 survival_3.8-3 RcppParallel_5.1.9
## [61] ggdist_3.3.2 xml2_1.3.6 pillar_1.9.0 tensorA_0.36.2.1 checkmate_2.3.2
## [66] stats4_4.4.1 distributional_0.5.0 generics_0.1.3 hms_1.1.3 rstantools_2.4.0
## [71] munsell_0.5.1 scales_1.3.0 xtable_1.8-4 glue_1.7.0 emmeans_1.10.6
## [76] tools_4.4.1 mvtnorm_1.3-2 grid_4.4.1 QuickJSR_1.5.1 colorspace_2.1-1
## [81] nlme_3.1-166 cli_3.6.3 fansi_1.0.6 svUnit_1.0.6 viridisLite_0.4.2
## [86] svglite_2.1.3 Brobdingnag_1.2-9 gtable_0.3.6 R.methodsS3_1.8.2 sass_0.4.9
## [91] digest_0.6.36 TH.data_1.1-2 farver_2.1.2 R.oo_1.27.0 htmltools_0.5.8.1
## [96] lifecycle_1.0.4 MASS_7.3-64