Note that you cannot run the code here unless you have requested and
obtained the TwinLife data. However, you do not need to do this to
reproduce the tutorial code in the RMarkdown document 02_tutorial.Rmd.
The synthetic dataset we produce here is available on our OSF project
and can be used for that purpose. The comparisons to the Raw estimates
(Figures 3 and 4 in the manuscript, see document 03_norm_comparisons)
can also be reproduced based on the synthetic dataset. The comparisons
to the Manual norms cannot be made reproducible as we failed to obtain
permission to share those values.
Preprocessing
# Read the raw TwinLife person file, keep only the variables used further
# below, recode missing-value codes via codebook::detect_missing(), and derive
# the CFT sum score, age in years, a logical sex indicator and age groups.
tl_prelim <-
  haven::read_dta("../unshareable_data/raw/ZA6701_person_wid1_v8-0-0.dta") %>%
  select(c(
    "wid", "fid", "pid", "ptyp", "age0100", "age0101", "sex",
    "mig0520", "mig2000", "mig3100", "mig3200",
    "edu0100", "edu0400", "eca0108",
    "igf0182", "igf0282", "igf0382", "igf0482",
    "eca0105", "inc0401", "eca0230"
  )) %>%
  # spell out TRUE: `T` is an ordinary variable that can be reassigned
  codebook::detect_missing(ninety_nine_problems = TRUE) %>%
  # keep only persons with a value on each of the four subtests
  filter(complete.cases(igf0182, igf0282, igf0382, igf0482)) %>%
  mutate(
    # total sum score across the sum scores of all 4 subtests; rows with
    # invalid or NA subtest scores were already dropped by the filter above
    cft = igf0182 + igf0282 + igf0382 + igf0482,
    # divide the age-in-months variable by 12 for easier interpretability
    age = age0101 / 12,
    # create logical sex variable
    male = sex == 1,
    # create age groups corresponding to those in the manual; ages outside
    # 11-64 (or missing) get NA
    age_group = case_when(
      age0100 == 11 ~ "11",
      age0100 == 12 ~ "12",
      age0100 == 13 ~ "13",
      age0100 == 14 ~ "14",
      age0100 == 15 ~ "15",
      age0100 == 16 ~ "16",
      age0100 >= 17 & age0100 <= 19 ~ "17-19",
      age0100 >= 20 & age0100 <= 24 ~ "20-24",
      age0100 >= 25 & age0100 <= 29 ~ "25-29",
      age0100 >= 30 & age0100 <= 34 ~ "30-34",
      age0100 >= 35 & age0100 <= 39 ~ "35-39",
      age0100 >= 40 & age0100 <= 44 ~ "40-44",
      age0100 >= 45 & age0100 <= 49 ~ "45-49",
      age0100 >= 50 & age0100 <= 54 ~ "50-54",
      age0100 >= 55 & age0100 <= 59 ~ "55-59",
      age0100 >= 60 & age0100 <= 64 ~ "60-64",
      TRUE ~ NA_character_
    )
  )
# Derive the migration variable used for matching against the census.
# The census migration variable encodes both a person's own and their parents'
# migration background and experience ("Migrationshintergrund und
# -erfahrung"), so the recoding proceeds in three steps: citizenship, place of
# birth of the person / their parents, and finally the combined category.
tl_w_mig <- tl_prelim %>%
  # step 1: lacking information about citizenship, assume German citizenship
  mutate(mig0520 = coalesce(mig0520, 1)) %>%
  # step 2: who (if anyone) was born abroad?
  mutate(
    born_abroad = case_when(
      # the person themselves was not born in Germany
      mig2000 != 1 ~ "self",
      # the person and both of their parents were born in Germany
      mig2000 == 1 & mig3100 == 1 & mig3200 == 1 ~ "none",
      # both parents were born abroad
      mig3100 != 1 & mig3200 != 1 ~ "both_parents",
      # exactly one parent is known to be born abroad
      mig3100 != 1 | mig3200 != 1 ~ "one_parent",
      # without any information, assume the person and their parents were
      # born in Germany
      TRUE ~ "none"
    )
  ) %>%
  # step 3: cross place of birth with citizenship (mig0520 == 1: citizen)
  mutate(
    mig = case_when(
      # non-citizens, with and without own migration experience
      mig0520 != 1 & born_abroad == "self" ~ "Non-citizen: Own mig experience",
      mig0520 != 1 & born_abroad != "self" ~ "Non-citizen: No own mig experience",
      # citizens, by own / parental migration background
      mig0520 == 1 & born_abroad == "self" ~ "Citizen: Own mig experience",
      mig0520 == 1 & born_abroad == "one_parent" ~ "Citizen: Mig background from one parent",
      mig0520 == 1 & born_abroad == "both_parents" ~ "Citizen: Mig background from both parents",
      mig0520 == 1 & born_abroad == "none" ~ "Citizen: No mig background"
    )
  )
# Create an education variable corresponding to the census tables, then
# restrict the sample (age range, known education, one twin per pair).
tl <- tl_w_mig %>%
  mutate(
    # turn the labelled isced variable into its label text and drop the first
    # three characters, so that the labels match the strings compared against
    # in the isced recoding below
    eca0108 = str_sub(as.character(as_factor(eca0108)), 4, -1),
    school_type = case_when(
      # we had to collapse some categories onto one another in order to ensure
      # correspondence to census categories while attempting to minimise data
      # loss. school types in the comments on the right are what the numerical
      # codes refer to
      edu0400 == 1 ~ "ST1: Primary", # Grundschule
      edu0400 == 2 ~ "ST6: Other school", # Orientierungsschule
      edu0400 == 3 ~ "ST2: Lower secondary", # Hauptschule
      edu0400 == 4 ~ "ST3: Intermediate secondary", # Realschule
      # Verbundene Haupt- und Realschule (auch Sekundar-, Real-, Regel-,
      # Mittel-, Ober- und Wirtschaftsschule, regionale Schule, erweiterte
      # Realschule), i.e. combined lower/intermediate secondary schools
      edu0400 == 5 ~ "ST6: Other school",
      edu0400 == 6 ~ "ST5: Comprehensive school", # Gesamtschule
      edu0400 == 7 ~ "ST6: Other school", # Waldorfschule
      edu0400 == 8 ~ "ST4: Upper secondary", # Gymnasium (auch Kolleg)
      edu0400 == 9 ~ "ST6: Other school", # Sonderschule/Förderschule
      edu0400 == 10 ~ "ST6: Other school", # Andere Schule (other school)
      # "Entfällt, da kein/e Schüler/-innen" (not applicable, as not a pupil)
      # is a category in the census, so kids who don't have a school type
      # category and who answered another question with "ich gehe nicht mehr
      # in die Schule" (I no longer go to school) get this category
      edu0100 == 3 ~ "ST7: No longer at school"
    ),
    isced = as.factor(case_when(
      # in TwinLife, the isced code is coded starting with age 15 but we use
      # it starting with 19 (see educ below). if the person is still at school
      # at age 19 or older, we assume they have a primary school certificate
      # ("ISCED 1: Primary")
      eca0108 == "3] -83: in school or training/not in school yet" ~ "ISCED 1: Primary",
      # exclude not codable
      eca0108 == "9] -89: not codable" ~ NA_character_,
      # those are kids younger than 15
      eca0108 == "5] -95: doesn't apply (screened out)" ~ NA_character_,
      # we don't change any coding here, only translation/explanation
      eca0108 == "level 1" ~ "ISCED 1: Primary",
      eca0108 == "level 2a" ~ "ISCED 2: Lower secondary",
      eca0108 == "level 3a" ~ "ISCED 3a: Upper secondary, general",
      eca0108 == "level 3b" ~ "ISCED 3b: Upper secondary, vocational",
      eca0108 == "level 3c" ~ "ISCED 3b: Upper secondary, vocational",
      eca0108 == "level 4a" ~ "ISCED 4: Post-secondary",
      eca0108 == "level 5a" ~ "ISCED 5a: Tertiary, e.g., college",
      # NOTE(review): the leading space in the next two strings is kept as in
      # the original; presumably these labels have a longer code prefix, so
      # one more character survives the str_sub() above -- confirm
      eca0108 == " level 5b" ~ "ISCED 5b: Tertiary, e.g., co-op program",
      eca0108 == " level 6" ~ "ISCED 6: PhD"
    )),
    # the final education variable combines the variable school type
    # (age < 19) and isced (age > 18); isced is converted back to character
    # explicitly so both branches of case_when() have the same type
    educ = as.factor(case_when(
      age0100 <= 18 ~ school_type,
      age0100 >= 19 ~ as.character(isced)
    )),
    # set a large category (upper secondary, vocational) as the reference
    # category of the educ factor; the level is referenced by name rather than
    # by position so a missing level cannot silently shift the reference
    educ = relevel(educ, ref = "ISCED 3b: Upper secondary, vocational"),
    # same for isced
    isced = relevel(isced, ref = "ISCED 3b: Upper secondary, vocational")
  ) %>%
  filter(
    # keep ages 11 to 65: kids younger than 11 are dropped even if they have
    # school type information (because most don't), and the upper bound is 65
    # because n per age group < 15 from 66 and older
    between(age0100, 11, 65),
    # education must be known
    !is.na(educ),
    # remove half the twins to reduce dependency between observations
    ptyp != 1
  )
# Persist the preprocessed dataset next to the other unshareable data.
save(tl, file = "../unshareable_data/preprocessed/tl.Rda")

# Tabulate the number of persons per person type (ptyp), with the labelled
# columns converted to factors so the value labels are displayed.
tl %>%
  group_by(ptyp) %>%
  as_factor() %>%
  count()
## # A tibble: 6 × 2
## # Groups: ptyp [6]
## ptyp n
## <fct> <int>
## 1 2: secondborn twin - u 2884
## 2 200: sibling - s 1032
## 3 300: mother - m 3585
## 4 400: father - f 2329
## 5 500: partner of mother - g 135
## 6 600: partner of father - n 15