Using the Student and Country Data

Introduction

The goal of learningtower is to offer a user-friendly R package that gives easy access to a subset of variables from the PISA data collected by the OECD. Version 1.1.0 of this package provides the data for the years 2000 - 2022. The survey data is published every three years. This is an excellent real-world dataset for data exploration, data visualisation and statistical computation.

This vignette documents how to access the data, and shows a few typical methods to explore the data.

Exploring the student data

Usage of the subset of the student data

Below is a quick example of loading the subset of the student data for 2018.

library(dplyr)
library(ggplot2)
library(learningtower)

# Load the 2018 student subset and the countrycode table in a single call
data(student_subset_2018, countrycode)

# Preview the columns, their types and the first few values of the subset
dplyr::glimpse(student_subset_2018)
#> Rows: 1,900
#> Columns: 22
#> $ year        <int> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018…
#> $ country     <fct> AUS, AUS, AUS, AUS, AUS, AUS, AUS, AUS, AUS, AUS, AUS, AUS…
#> $ school_id   <chr> "3600310", "3600251", "3600639", "3600704", "3600736", "36…
#> $ student_id  <int> 3606824, 3605794, 3613122, 3611607, 3606403, 3621081, 3611…
#> $ mother_educ <fct> "ISCED 3A", "ISCED 2", "ISCED 3A", "ISCED 3A", NA, "ISCED …
#> $ father_educ <fct> "ISCED 3A", "ISCED 1", "ISCED 3A", "ISCED 1", NA, "ISCED 3…
#> $ gender      <fct> male, female, female, female, male, female, male, female, …
#> $ computer    <fct> yes, yes, yes, no, yes, yes, yes, yes, yes, yes, yes, yes,…
#> $ internet    <fct> yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, yes,…
#> $ math        <dbl> 427.631, 479.464, 490.057, 517.379, 527.985, 462.708, 307.…
#> $ read        <dbl> 425.307, 513.694, 593.488, 588.042, 535.348, 401.866, 258.…
#> $ science     <dbl> 369.288, 486.794, 469.697, 584.995, 560.943, 467.130, 325.…
#> $ stu_wgt     <dbl> 3.11283, 27.41035, 14.56053, 21.68163, 15.57013, 10.96668,…
#> $ desk        <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes…
#> $ room        <fct> yes, yes, yes, yes, yes, yes, yes, yes, no, yes, yes, yes,…
#> $ dishwasher  <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
#> $ television  <fct> 1, 3+, 3+, 2, 2, 3+, 2, 3+, 3+, 3+, 3+, 3+, 2, 1, NA, 2, 1…
#> $ computer_n  <fct> 2, 1, 3+, 1, 3+, 3+, 3+, 3+, 3+, 2, 3+, 1, 3+, 3+, NA, 3+,…
#> $ car         <fct> 3+, 3+, 3+, 3+, 2, 3+, 2, 2, 1, 3+, 3+, 2, 2, 2, NA, 3+, 1…
#> $ book        <fct> 201-500, 101-200, 0-10, 0-10, 26-100, 201-500, 101-200, 26…
#> $ wealth      <dbl> 0.1688, 0.6327, 1.4097, -0.1318, 0.2886, 1.8571, 0.9773, 0…
#> $ escs        <dbl> 1.2100, -0.8160, 0.0932, -0.4153, NA, 0.7428, 0.2569, -0.6…
# Three-letter codes of the countries compared in the plots of this vignette
selected_countries <- c("AUS", "USA", "TUR", "SWE",
                        "CHE", "NZL", "BEL", "DEU")

# Boxplots of the 2018 maths scores by gender for the selected countries.
# No group_by() is needed here: ggplot2 forms the boxplot groups from the
# y and fill aesthetics, so the data only has to be filtered and joined.
student_subset_2018 |>
  dplyr::filter(country %in% selected_countries) |>
  # bring in country_name so the axis shows names instead of codes
  dplyr::left_join(countrycode, by = "country") |>
  ggplot(aes(x = math,
             y = country_name,
             fill = gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#FF7F0EFF", "#1F77B4FF")) +
  theme_classic() +
  labs(x = "Math score",
       y = "")

Usage of the entire student data

# load the entire student data for the year 2018
student_data_2018 <- load_student(2018)

# load the entire student data for two of the years (2012, 2018);
# this object is reused for the plot below, so it is loaded only once
student_data_2012_2018 <- load_student(c(2012, 2018))

# load the entire student data for all available years
student_data_all <- load_student("all")

# Average maths score per country and year for the selected countries,
# plus the text and x position of the labels drawn next to each point.
plot_data <- student_data_2012_2018 |>
  # filter before grouping so only the selected countries are summarised
  dplyr::filter(country %in% selected_countries) |>
  dplyr::group_by(country, year) |>
  # .groups = "drop" returns an ungrouped result, so select() below really
  # drops `country` instead of silently re-adding it as a grouping column
  dplyr::summarise(avg_math = mean(math, na.rm = TRUE),
                   .groups = "drop") |>
  dplyr::left_join(countrycode, by = "country") |>
  dplyr::select(country_name, year, avg_math) |>
  dplyr::mutate(
    # 2012 labels are placed at x = 2010, all other labels at x = 2019
    label_x_pos = ifelse(year == 2012, 2012 - 2, 2018 + 1),
    # 2012 labels carry the country name and score, the others only the
    # score; as.character() makes both branches character explicitly
    label = ifelse(
      year == 2012,
      paste0(country_name, ", ", round(avg_math)),
      as.character(round(avg_math))))
  
# Slope chart of each country's average maths score in 2012 and 2018.
plot_data |>
  ggplot(aes(x = year,
             y = avg_math,
             label = label,
             colour = country_name,
             group = country_name)) +
  geom_point() +
  geom_line() +
  # one layer draws the dashed reference lines at both survey years
  geom_vline(xintercept = c(2012, 2018),
             linetype = "dashed",
             linewidth = 0.1) +
  # labels are drawn at the precomputed label_x_pos rather than at `year`
  geom_text(aes(x = label_x_pos)) +
  # the limits extend past 2012 and 2018 so the labels fit inside the panel
  scale_x_continuous(breaks = c(2012, 2018),
                     limits = c(2008, 2020)) +
  scale_colour_manual(values = c("#1F77B4FF", "#FF7F0EFF", "#2CA02CFF", "#D62728FF",
                                 "#9467BDFF", "#8C564BFF", "#E377C2FF", "#7F7F7FFF")) +
  labs(x = "",
       y = "Average maths score") +
  theme_classic() +
  # the text labels carry the scores and country names, so the y axis
  # text/ticks and the colour legend are hidden
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none")