knitr::opts_chunk$set(
    message = FALSE,
    warning = FALSE,
    include = TRUE,
    error = TRUE
)

knitr::opts_chunk$set(fig.ext = 'svg')


options(scipen = 999,
        digits = 4,
        width = 120)


library(tidyverse)
library(haven)
library(codebook)
library(kableExtra)

1 TwinLife sample

Note that you cannot run the code here unless you have requested and obtained the TwinLife data. You do not need to do this however, to reproduce the tutorial code in the RMarkdown document 02_tutorial.Rmd. The synthetic dataset we produce here is available on our OSF project and can be used for that purpose. The comparisons to the Raw estimates (Figures 3 and 4 in the manuscript, see document 03_norm_comparisons) can also be reproduced based on the synthetic dataset. The comparisons to the Manual norms cannot be made reproducible as we failed to obtain permission to share those values.

1.1 Preprocessing

tl_prelim <- 
  haven::read_dta("../unshareable_data/raw/ZA6701_person_wid1_v8-0-0.dta") %>% 
  select(c("wid", "fid", "pid", "ptyp", "age0100", "age0101", "sex", "mig0520", "mig2000", "mig3100", "mig3200", "edu0100", "edu0400", "eca0108", "igf0182", "igf0282", "igf0382", "igf0482", "eca0105", "inc0401", "eca0230")) %>%
  codebook::detect_missing(ninety_nine_problems = T) %>%
  filter(complete.cases(igf0182, igf0282, igf0382, igf0482)) %>%
  # total sum scores of the sum scores of all 4 subtests, automatically excludes invalid and NA on subtests
  mutate(cft = igf0182 + igf0282 + igf0382 + igf0482,
  # divide age in months variable by 12 for easier interpretablility
         age = age0101/12,
  # create logical sex variable
         male = sex == 1, 
  # create age groups corresponding to those in the manual
  age_group = case_when(
                         age0100 == 11 ~ '11',
                         age0100 == 12 ~ '12',
                         age0100 == 13 ~ '13',
                         age0100 == 14 ~ '14',
                         age0100 == 15 ~ '15',
                         age0100 == 16 ~ '16',
                         age0100 >= 17 & age0100 <= 19 ~ '17-19',
                         age0100 >= 20 & age0100 <= 24 ~ '20-24',
                         age0100 >= 25 & age0100 <= 29 ~ '25-29',
                         age0100 >= 30 & age0100 <= 34 ~ '30-34',
                         age0100 >= 35 & age0100 <= 39 ~ '35-39',
                         age0100 >= 40 & age0100 <= 44 ~ '40-44',
                         age0100 >= 45 & age0100 <= 49 ~ '45-49',
                         age0100 >= 50 & age0100 <= 54 ~ '50-54',
                         age0100 >= 55 & age0100 <= 59 ~ '55-59',
                         age0100 >= 60 & age0100 <= 64 ~ '60-64',
                         TRUE ~ NA_character_)) 

tl_w_mig <- tl_prelim %>%
  # since the migration variable in the census encodes information about both one's own and the parent's migration background and experience ("Migrationshintergrund und -erfahrung"), we start by creating a variable indicating whether the person and/or their parents is/are born abroad
  # assume German citizenship lacking information about citizenship
  mutate(mig0520 = coalesce(mig0520, 1), 
         born_abroad = case_when(
           # if the person themselves is not born in germany, then born_abroad = "self"
            mig2000 != 1 ~ 'self',
           # if the person and both their parents are born in germany, then born_abroad = "none"
            mig2000 == 1 & mig3100 == 1 & mig3200 == 1 ~ 'none',
           # etc..
            mig3100 != 1 & mig3200 != 1 ~ 'both_parents',
            mig3100 != 1 ~ 'one_parent',
            mig3200 != 1 ~ 'one_parent',
           # if no information is available, assume both the person and their parents are born in germany
            TRUE ~ "none"), 
         mig = case_when(
           # if the person is born abroad and is a Citizen... 
            born_abroad == "self" & mig0520 == 1 ~ "Citizen: Own mig experience",
           # etc..
            born_abroad == "self" & mig0520 != 1 ~ "Non-citizen: Own mig experience",
            born_abroad != "self" & mig0520 != 1 ~ "Non-citizen: No own mig experience",
            born_abroad == "one_parent" & mig0520 == 1 ~ "Citizen: Mig background from one parent",
            born_abroad == "both_parents" & mig0520 == 1 ~ "Citizen: Mig background from both parents",
            born_abroad == "none" & mig0520 == 1 ~ "Citizen: No mig background"))

# create an education variable corresponding to the census tables
tl <- tl_w_mig %>% 
  mutate(eca0108 = str_sub(as.character(as_factor(eca0108)), 4, -1),
         school_type = case_when(
           # we had to collapse some categories unto one another in order to ensure correspondance to census categories while attempting to minimise data loss. school types in the comments on the right are what the numerical codes refer to
            edu0400 == 1 ~ "ST1: Primary", # Grundschule
            edu0400 == 2 ~ "ST6: Other school", # Orientierungsschule
            edu0400 == 3 ~ "ST2: Lower secondary", # Hauptschule
            edu0400 == 4 ~ "ST3: Intermediate secondary", # Realschule
            edu0400 == 5 ~ "ST6: Other school", # Verbundene Haupt- und Realschule (auch Sekundar-, Real-, Regel-, Mittel-, Ober- und Wirtschaftsschule, regionale Schule, erweiterte Realschule)
            edu0400 == 6 ~ "ST5: Comprehensive school", # Gesamtschule
            edu0400 == 7 ~ "ST6: Other school", # Waldorfschule
            edu0400 == 8 ~ "ST4: Upper secondary", # Gymnasium (auch Kolleg)
            edu0400 == 9 ~ "ST6: Other school", # Sonderschule/Förderschule
            edu0400 == 10 ~ "ST6: Other school", # Andere Schule
            # "Entfällt, da kein/e Schüler/-innen" is a category in the census so we gave kids who don't have a school type category and who specified as an answer to another question that they no longer go to school this category
            edu0100 == 3 ~ "ST7: No longer at school"),  # "ich gehe nicht mehr in die Schule"
         isced = as.factor(case_when(
           # in TwinLife, isced code is coded starting with age 15 but we use it starting with 19 (see code further below). if the person is still at school at age 19 or older, we assume they  have a primary school certificate ("ISCED 1: Primary")
            eca0108 == "3] -83: in school or training/not in school yet" ~ "ISCED 1: Primary",
           # Exclude not codable 
            eca0108 == "9] -89: not codable" ~ NA_character_,
           # Those are kids younger than 15
            eca0108 == "5] -95: doesn't apply (screened out)" ~ NA_character_,
           # we don't change any coding here, only translation/explanation
            eca0108 == "level 1" ~   "ISCED 1: Primary",
            eca0108 == "level 2a" ~  "ISCED 2: Lower secondary",
            eca0108 == "level 3a" ~  "ISCED 3a: Upper secondary, general",
            eca0108 == "level 3b" ~  "ISCED 3b: Upper secondary, vocational",
            eca0108 == "level 3c" ~  "ISCED 3b: Upper secondary, vocational",
            eca0108 == "level 4a" ~  "ISCED 4: Post-secondary",
            eca0108 == "level 5a" ~  "ISCED 5a: Tertiary, e.g., college",
            eca0108 == " level 5b" ~ "ISCED 5b: Tertiary, e.g., co-op program",
            eca0108 == " level 6" ~  "ISCED 6: PhD")),
           # the final education variable combines the variable school type (age <19) and isced (age > 18)
         educ = as.factor(case_when(
           age0100 <= 18 ~ school_type, 
           age0100 >= 19 ~ isced)),
           # set a large category (upper secondary, vocational) as the reference category of the educ factor
         educ = relevel(educ, 4),
           # same for isced
         isced = relevel(isced, 4)) %>%
 filter(# filter out kids aged <11 who have school type information (because most don't), <= 65 because n per age group < 15 from 66 and older
         between(age0100, 11, 65) &
         !is.na(educ) &
         # remove half the twins to reduce dependency between observations
         ptyp != 1
        )

# save dataset
save(tl, file="../unshareable_data/preprocessed/tl.Rda")

tl %>% group_by(ptyp) %>% as_factor() %>% count()

## # A tibble: 6 × 2
## # Groups:   ptyp [6]
##   ptyp                           n
##   <fct>                      <int>
## 1 2: secondborn twin - u      2884
## 2 200: sibling - s            1032
## 3 300: mother - m             3585
## 4 400: father - f             2329
## 5 500: partner of mother - g   135
## 6 600: partner of father - n    15

1.2 plots

ggplot(tl, aes(x = cft)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(x = "CFT",
       y = "Frequency") +
  theme_minimal()

ggplot(tl, aes(x = age0100)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(x = "age",
       y = "Frequency") +
  theme_minimal()

2 CFT manual norms

The raw means and SDs manually extracted from the CFT-20R manual had to be redacted since Hogrefe (publisher of the manual) did not allow us to share them.

# manual_norms <- tibble(
#          age_group = c('11:1-11:6', '11:7-12', '12:1-12:6', '12:7-13', '13:1-13:6', '13:7-14', '14:1-14:6', '14:7-15', '15:1-16', '16:1-17', '17:1-19:11', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59' ,'60-64'),
#          n_manual =    c(               R E D A C T E D             ),
#          mean_manual = c(               R E D A C T E D             ),
#          sd_manual =   c(               R E D A C T E D             )) %>%
#   filter(!is.na(age_group)) %>%
#   mutate(age_group = case_when(
#                          age_group == "10:1-10:6" | age_group == "10:7-11" ~ '10',
#                          age_group == "11:1-11:6" | age_group == "11:7-12" ~ '11',
#                          age_group == "12:1-12:6" | age_group == "12:7-13" ~ '12',
#                          age_group == "13:1-13:6" | age_group == "13:7-14" ~ '13',
#                          age_group == "14:1-14:6" | age_group == "14:7-15" ~ '14',
#                          age_group == "15:1-16"                            ~ '15',
#                          age_group == "16:1-17"                            ~ '16',
#                          age_group == "17:1-19:11"                         ~ '17-19',
#                          TRUE ~ age_group)) %>%
#   group_by(age_group) %>%
#   summarise(Manual_n = sum(n_manual),
#             Manual_mean = mean(mean_manual),
#             Manual_sd = sqrt(mean(sd_manual^2)),
#             Manual_se_of_mean = Manual_sd/sqrt(Manual_n))

# save(manual_norms, file="../unshareable_data/preprocessed/manual_norms.Rda")
# write.csv(manual_norms, file = "../unshareable_data/preprocessed/manual_norms.csv")

3 Census tables

3.1 Main poststratification table

census_school_type_raw <- readxl::read_excel("data/raw/de_census/729305_Zensus2011_Bildung_Schulform.xlsx", 
    sheet = "Migration und Schulform") 

census_school_type <- census_school_type_raw %>% 
  # remove redundant columns of higher order categories (e.g., Deutsche mit Migrationshintergrund)
  select(1:2, 21:28, 45:60, 69:76, 85:100) %>%
  # exclude sex and age columns
  select(3:50) %>%
  # before iteratively naming the 48 columns, 8 (school types) * 6 (migration background) 
  set_names(map(c("Citizen: No mig background", # Personen ohne Migrationshintergrund
                  "Non-citizen: Own mig experience", # Ausländer/-innen mit eigener Migrationserfahrung
                  "Non-citizen: No own mig experience", # Ausländer/-innen ohne eigene Migrationserfahrung
                  "Citizen: Own mig experience", # Deutsche mit eigener Migrationserfahrung
                  "Citizen: Mig background from both parents", # Deutsche mit beidseitigem Migrationshintergrund
                  "Citizen: Mig background from one parent"), # Deutsche mit einseitigem Migrationshintergrund
                ~ paste0(.x, "_", c("total", 
                                    "ST7: No longer at school", # Entfällt, da kein/e Schüler/-innen
                                    "ST1: Primary", # Grundschule
                                    "ST2: Lower secondary", # Hauptschule
                                    "ST3: Intermediate secondary", # Realschule
                                    "ST4: Upper secondary", # Gymnasium
                                    "ST5: Comprehensive school", # Gesamtschule
                                    "ST6: Other school"))) %>% # Sonstige Schule
            unlist()) %>%
  # recover sex and age columns
  mutate(age = as.numeric(parse_number(census_school_type_raw[[2]])),
         male = census_school_type_raw[[1]],
         ) %>% 
  relocate(c(age,male)) %>% 
  mutate(male = ifelse(row_number() >= 115 & row_number() <= 215, TRUE, FALSE)) %>% 
  slice(-(1:113), -215) %>% 
  # disentangle mig and educ variables from one another
  pivot_longer(cols = 3:50,
               names_to = c("mig", "school_type"),
               names_sep = "_",
               values_to = "census_n") %>%
 # filter(census_n != "/") %>% 
  # set censored cells to 0
  mutate(census_n = ifelse(census_n == "/", 0, as.numeric(census_n)))



census_ISCED_raw <- readxl::read_excel("data/raw/de_census/729305_Zensus2011_Bildung_ISCED.xlsx", sheet = "Migration und ISCED")

census_ISCED <- census_ISCED_raw %>% 
  set_names(paste0("var", 1:133)) %>% 
  select(-(
    census_ISCED_raw %>% 
    summarise(across(everything(), ~ any(str_detect(., "ISCED-Ebene")) & any(str_detect(., "Insgesamt")))) %>% 
    unlist() %>% 
    which()
  )) %>% 
  select(1:2, var24:var34, var57:var78, var90:var100, var112:var133) %>% 
  select(3:56) %>%
  set_names(map(c("Citizen: No mig background", # Personen ohne Migrationshintergrund
                  "Non-citizen: Own mig experience", # Ausländer/-innen mit eigener Migrationserfahrung
                  "Non-citizen: No own mig experience", # Ausländer/-innen ohne eigene Migrationserfahrung
                  "Citizen: Own mig experience", # Deutsche mit eigener Migrationserfahrung
                  "Citizen: Mig background from both parents", # Deutsche mit beidseitigem Migrationshintergrun,
                  "Citizen: Mig background from one parent"), # Deutsche mit einseitigem Migrationshintergrund
         ~ paste0(.x, "_", c("total",
                             "ISCED 1: Primary", # ISCED-Ebene 1 = Primärbereich
                             "ISCED 2: Lower secondary", # ISCED-Ebene 2 = Sekundarbereich I
                             "ISCED 3a: Upper secondary, general", # Sekundarbereich II A, allgemein bildend
                             "ISCED 3b: Upper secondary, vocational", # Sekundarbereich II B, beruflich
                             "ISCED 4: Post-secondary", # ISCED-Ebene 4 = Postsekundäre nichttertiäre Bildung
                             "ISCED 5a: Tertiary, e.g., college", # ISCED-Ebene 5 = Erste Stufe der tertiären Bildung, Tertiärbereich A
                             "ISCED 5b: Tertiary, e.g., co-op program", # ISCED-Ebene 5 = Erste Stufe der tertiären Bildung, Tertiärbereich B
                             "ISCED 6: PhD" # ISCED-Ebene 6 = Zweite Stufe der tertiären Bildung
                             ))) %>% 
            unlist()) %>% 
  mutate(age = as.numeric(parse_number(census_ISCED_raw[[2]])),
         male = census_ISCED_raw[[1]]) %>% 
  relocate(c(age,male)) %>% 
  mutate(male = ifelse(row_number() >= 99 & row_number() <= 186, TRUE, FALSE)) %>% 
  slice(-(1:99), -186) %>% 
  pivot_longer(cols = 3:56,
               names_to = c("mig", "isced"),
               names_sep = "_",
               values_to = "census_n") %>%
 # filter(census_n != "/") %>% 
  mutate(census_n = ifelse(census_n == "/", 0, as.numeric(census_n)))


# 11 because that's the minimum age for the educ and mig variables we used in the TL sample
census_school_type_11_18 <- census_school_type %>% 
  filter((age >= 11 & age <= 18) & school_type != "total") %>% 
  rename(educ = "school_type") %>% 
  relocate(educ, .after = "mig")

# 65 
census_ISCED_19_65 <- census_ISCED %>% 
  filter((age >= 19 & age <= 65) & isced != "total") %>% 
  rename(educ = "isced")

census <- census_school_type_11_18 %>% 
  bind_rows(census_ISCED_19_65) 

save(census, file="data/preprocessed/de_census/census.Rda")

# print random 14 rows/cells/subgroups/combinations out of the total 6528 in the poststratification table
census %>% slice_sample(n = 14) %>% kable %>% kable_styling(full_width = FALSE)

age	male	mig	educ	census_n
57	TRUE	Citizen: Own mig experience	ISCED 3b: Upper secondary, vocational	19380
41	TRUE	Non-citizen: Own mig experience	ISCED 5a: Tertiary, e.g., college	7190
11	TRUE	Citizen: Mig background from both parents	ST6: Other school	1450
64	FALSE	Citizen: Mig background from both parents	ISCED 1: Primary	0
35	FALSE	Citizen: No mig background	ISCED 5a: Tertiary, e.g., college	58550
62	FALSE	Citizen: Mig background from both parents	ISCED 5b: Tertiary, e.g., co-op program	0
50	TRUE	Non-citizen: No own mig experience	ISCED 1: Primary	300
48	TRUE	Non-citizen: No own mig experience	ISCED 2: Lower secondary	830
27	TRUE	Non-citizen: No own mig experience	ISCED 5b: Tertiary, e.g., co-op program	500
28	TRUE	Citizen: Mig background from one parent	ISCED 5a: Tertiary, e.g., college	3480
24	TRUE	Citizen: No mig background	ISCED 1: Primary	9330
61	TRUE	Citizen: No mig background	ISCED 4: Post-secondary	10340
39	FALSE	Citizen: No mig background	ISCED 5a: Tertiary, e.g., college	64090
14	TRUE	Citizen: Mig background from both parents	ST4: Upper secondary	8290

3.2 Totals for the disparities plot

age_margins <- census_school_type_raw %>%
  # total column, rows for ages 11 through 65
  select(3) %>% 
  slice(23:77) %>% 
  mutate(n = as.numeric(`...3`),
         category = as.character(11:65),
         percentage = n/sum(n)*100,
         source = "census",
         variable = "age") %>% 
  select(-1)

sex_margins <- census_school_type_raw %>%
  # total column, rows for ages 11 through 65
  select(3) %>% 
  slice((125:179), (227:281)) %>% 
  mutate(category = as.character(ifelse(row_number() < 56, T, F)),
         n = as.numeric(`...3`)) %>% 
  group_by(category) %>% 
  summarize(n = sum(n)) %>% 
  mutate(source = "census",
         variable = "male",
         percentage = n/sum(n)*100)

school_type_margins <- census_school_type_raw %>% 
  select(14:20) %>% 
  slice(23:30) %>% 
  set_names(paste0(c("ST7: No longer at school", # Entfällt, da kein/e Schüler/-innen
                     "ST1: Primary", # Grundschule
                     "ST2: Lower secondary", # Hauptschule
                     "ST3: Intermediate secondary", # Realschule
                     "ST4: Upper secondary", # Gymnasium
                     "ST5: Comprehensive school", # Gesamtschule
                     "ST6: Other school"))) %>% # Sonstige Schule  
  mutate_all(as.numeric) %>% 
  summarise(across(everything(), ~sum(., na.rm = TRUE))) %>% 
  pivot_longer(cols = everything(), names_to = "category", values_to = "n")



educ_margins <- census_ISCED_raw %>% 
  select(14, 15, 17:19, 21, 22, 23) %>% 
    slice(17:63) %>% 
  set_names(paste0(c("ISCED 1: Primary",                 
                     "ISCED 2: Lower secondary",          
                     "ISCED 3a: Upper secondary, general", 
                     "ISCED 3b: Upper secondary, vocational",
                     "ISCED 4: Post-secondary",           
                     "ISCED 5a: Tertiary, e.g., college",   
                     "ISCED 5b: Tertiary, e.g., co-op program",
                     "ISCED 6: PhD"))) %>% 
  mutate_all(as.numeric) %>% 
  summarise(across(everything(), ~sum(., na.rm = TRUE))) %>% 
  pivot_longer(cols = everything(), names_to = "category", values_to = "n") %>% 
  full_join(school_type_margins, by = "category") %>% 
  mutate(n = ifelse((is.na(n.x) | is.na(n.y)), coalesce(n.x, n.y), n.x + n.y),
         source = "census",
         variable = "educ",
         percentage = n/sum(n)*100) %>% 
  select(-c(n.x,n.y))

mig_margins <- census_school_type_raw %>% 
  select(4, 7, 8, 10, 12, 13) %>% 
  slice(23:77) %>% 
  set_names(paste0(c("Citizen: No mig background", 
                     "Non-citizen: Own mig experience", 
                     "Non-citizen: No own mig experience", 
                     "Citizen: Own mig experience",
                     "Citizen: Mig background from both parents",
                     "Citizen: Mig background from one parent"))) %>% 
  mutate_all(as.numeric) %>% 
  summarise(across(everything(), ~sum(., na.rm = TRUE))) %>% 
  pivot_longer(cols = everything(), names_to = "category", values_to = "n") %>% 
  mutate(source = "census",
         variable = "mig",
         percentage = n/sum(n)*100)
                     
census_margins <- bind_rows(age_margins, educ_margins, sex_margins, mig_margins)

save(census_margins, file="data/preprocessed/de_census/census_margins.Rda")

4 Session info

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 codebook_0.9.6   haven_2.5.4      lubridate_1.9.4  forcats_1.0.0    stringr_1.5.1   
##  [7] dplyr_1.1.4      purrr_1.0.2      readr_2.1.5      tidyr_1.3.1      tibble_3.2.1     ggplot2_3.5.1   
## [13] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9        utf8_1.2.4        generics_0.1.3    xml2_1.3.6        stringi_1.8.4     hms_1.1.3        
##  [7] digest_0.6.36     magrittr_2.0.3    evaluate_0.24.0   grid_4.4.1        timechange_0.3.0  fastmap_1.2.0    
## [13] cellranger_1.1.0  jsonlite_1.8.8    fansi_1.0.6       viridisLite_0.4.2 scales_1.3.0      jquerylib_0.1.4  
## [19] cli_3.6.3         labelled_2.14.0   rlang_1.1.4       munsell_0.5.1     withr_3.0.0       cachem_1.1.0     
## [25] yaml_2.3.8        tools_4.4.1       tzdb_0.4.0        colorspace_2.1-1  vctrs_0.6.5       R6_2.5.1         
## [31] lifecycle_1.0.4   pkgconfig_2.0.3   pillar_1.9.0      bslib_0.7.0       gtable_0.3.6      glue_1.7.0       
## [37] systemfonts_1.1.0 highr_0.11        xfun_0.45         tidyselect_1.2.1  rstudioapi_0.16.0 knitr_1.47       
## [43] farver_2.1.2      htmltools_0.5.8.1 labeling_0.4.3    rmarkdown_2.27    svglite_2.1.3     compiler_4.4.1   
## [49] readxl_1.4.3

Data preprocessing code

2025-03-29