#---------------------------------------------------------------------------#
# Project:    Surfeel
# File:       Randomise treatment
# Author:     Boelmann, Morin, Person, Santiago Wolf
# Date:       29/11/2024
#---------------------------------------------------------------------------#

# This file randomises treatment and control classes within schools, separately
# for kindergarten (III.A) and elementary school (III.B) classes. For small
# schools with few kindergarten classes (all districts other than Lisbon, see
# III.A), we randomise at the school level.

# Input file: Classes_overview.csv.
# Output file: randomisation_classes.csv.
# This will be run on R version 4.3.2, running under macOS 15.1.1.

# Paths
# RAND_PATH is the current working directory with a trailing "/". Both the input
# and the output csv are addressed relative to it, so the script has to be
# started from the folder that contains Classes_overview.csv.
RAND_PATH <- paste0(getwd(), "/")

# Packages
library(dplyr) # version used: dplyr_1.1.4


#---------------------------------------------------------------------------#
# I. Preparation
#---------------------------------------------------------------------------#

# Set seed.
# Seed will be changed to DAX Index value at 2 Dec 2024, 11:00 am CET.
# NOTE(review): set.seed() interprets its argument as an integer, so the
# decimals of 19458.37 are not used and this call is equivalent to
# set.seed(19458).
set.seed(19458.37)
#set.seed(0)

# Read in class level data.
# The later sections use the columns district, school, class and type.
classes <- read.csv(paste0(RAND_PATH, "Classes_overview.csv"))


#---------------------------------------------------------------------------#
# II. Function definitions
#---------------------------------------------------------------------------#

# Function to randomise within school
#---------------------------------------------------------------------------#

# Randomise treatment across classes within each school.
#
# Args:
#   df_classes: data frame with one row per class and a column `school`.
#
# Returns:
#   `df_classes` as an ungrouped tibble with an added numeric column `treat`
#   (1 = treatment, 0 = control). Row order is unchanged.
#
# Within every school the classes are split as evenly as possible between
# treatment and control. A school with an odd number of classes has one surplus
# class; a randomly chosen half of these schools gets the surplus class in
# control, the other half in treatment, so that the overall numbers stay
# balanced. A school with a single class cannot contain both arms.
# The result depends on the state of the random number generator (set.seed()).
rand_classes <- function(df_classes) {

  # Start from a deterministic 1, 0, 1, 0, ... pattern within each school. This
  # guarantees that both values occur in every school with at least two
  # classes; the values are shuffled within schools further below.
  df_classes <- df_classes %>%
    group_by(school) %>%
    mutate(treat = rep(c(1, 0), length.out = n())) %>%
    ungroup()

  # Number of classes per school (one row per school, ordered by school).
  school_counts <- df_classes %>%
    group_by(school) %>%
    summarise(count = n())

  # Schools with an odd number of classes: the pattern above gives them one
  # more treated than control class.
  uneven_schools <- school_counts %>%
    filter(count %% 2 != 0) %>%
    pull(school)
  nb_schools_uneven <- length(uneven_schools)

  # Half of the odd-sized schools get one 1 replaced by a 0 to restore overall
  # balance. If their number is itself odd, a coin flip decides whether the
  # extra school is changed or not.
  nb_schools_need_change <- floor(nb_schools_uneven / 2)
  if (nb_schools_uneven %% 2 != 0) {
    nb_schools_need_change <- nb_schools_need_change +
      sample(c(0, 1), size = 1)
  }

  # Randomly pick the schools to change. The draw is made over positions and
  # not via sample(uneven_schools, ...): sample() reads a single number x >= 1
  # as "draw from 1:x", so with numeric school ids and exactly one odd-sized
  # school it would return an unrelated id. In all other cases sample.int()
  # consumes the same random numbers as sample() would.
  schools_need_change <- uneven_schools[
    sample.int(nb_schools_uneven,
               size = nb_schools_need_change,
               replace = FALSE)
  ]

  # In the selected schools, turn the first class (always a 1 in the pattern
  # above) into a 0.
  df_classes <- df_classes %>%
    group_by(school) %>%
    mutate(treat = ifelse(
      school %in% schools_need_change & row_number() == 1, 0, treat
    )) %>%
    ungroup()

  # Randomly shuffle the 0/1 values within each school, see
  # https://stackoverflow.com/questions/67195016/randomly-sort-rows-within-group
  # sample(treat) is safe here although `treat` can have length one: its only
  # possible values are 0 and 1, and sample(1) as well as sample(0) return
  # their input.
  df_classes <- df_classes %>%
    group_by(school) %>%
    mutate(treat = sample(treat)) %>%
    ungroup()

  df_classes
}


# Function to randomise across schools
#---------------------------------------------------------------------------#

# Randomise treatment at the school level.
#
# Args:
#   df_classes: data frame with one row per class and a column `school`.
#
# Returns:
#   `df_classes` with an added numeric column `treat` that is 1 for every class
#   of a treated school and 0 otherwise.
#
# Half of the distinct schools are treated. With an odd number of schools a
# coin flip decides whether one more or one less school is treated.
# The result depends on the state of the random number generator (set.seed()).
rand_schools <- function(df_classes) {

  # Distinct schools, in order of first appearance.
  unique_schools <- unique(df_classes$school)
  nb_schools <- length(unique_schools)

  # Number of schools to treat.
  nb_schools_treat <- floor(nb_schools / 2)
  if (nb_schools %% 2 != 0) {
    nb_schools_treat <- nb_schools_treat + sample(c(0, 1), size = 1)
  }

  # Randomly pick the treated schools. The draw is made over positions and not
  # via sample(unique_schools, ...): sample() reads a single number x >= 1 as
  # "draw from 1:x", so with one numeric school id it would return an unrelated
  # id and no school would be treated. In all other cases sample.int() consumes
  # the same random numbers as sample() would.
  treated_schools <- unique_schools[
    sample.int(nb_schools, size = nb_schools_treat, replace = FALSE)
  ]

  # Create the indicator variable treat.
  df_classes$treat <- ifelse(df_classes$school %in% treated_schools, 1, 0)

  df_classes
}


#---------------------------------------------------------------------------#
# III. Randomise
#---------------------------------------------------------------------------#

## A) Kindergarten classes
#---------------------------------------------------------------------------#

# Get kindergarten classes.
# NOTE(review): "Kindergarden" has to match the spelling used in the `type`
# column of Classes_overview.csv - confirm.
# which() keeps only rows where the comparison is TRUE; plain logical
# subsetting would turn rows with a missing `type` into all-NA rows.
classes_kg <- classes[which(classes$type == "Kindergarden"),
                      c("district", "school", "class")]

### In Lisbon, we can randomise within school.
classes_kg_1 <- rand_classes(classes_kg %>% filter(district == "Lisbon"))

# Verify overall balance.
# The checks are printed explicitly so that they are also shown when the
# script is run via source(), which does not auto-print top-level values.
print(table(classes_kg_1$treat))

# Check if both 0 and 1 exist in the 'treat' column in each school.
print(
  classes_kg_1 %>%
    group_by(school) %>%
    summarise(has_treat_0 = any(treat == 0),
              has_treat_1 = any(treat == 1))
)

### For other districts, we randomise on school level.
classes_kg_2 <- rand_schools(classes_kg %>%
                               filter(district != "Lisbon"))

## B) Elementary school classes
#---------------------------------------------------------------------------#
# We randomise within schools. 

# Get classes.
# which() keeps only rows where the comparison is TRUE. Plain logical
# subsetting would turn rows with a missing `type` into all-NA rows, which
# would then be randomised and exported as a spurious school "NA".
classes_el <- classes[which(classes$type == "Elementary"),
                      c("district", "school", "class")]

# Randomise within schools.
classes_el <- rand_classes(classes_el)

# Verify overall balance.
# The checks are printed explicitly so that they are also shown when the
# script is run via source(), which does not auto-print top-level values.
print(table(classes_el$treat))

# Check if both 0 and 1 exist in the 'treat' column in each school.
print(
  classes_el %>%
    group_by(school) %>%
    summarise(has_treat_0 = any(treat == 0),
              has_treat_1 = any(treat == 1))
)

#---------------------------------------------------------------------------#
# IV. Export results
#---------------------------------------------------------------------------#

# Append kindergarten and elementary school.
combined_classes <- bind_rows(classes_kg_1, classes_kg_2, classes_el) %>%
  arrange(district, school, treat)

# Verify overall balance.
# The checks are printed explicitly so that they are also shown when the
# script is run via source(), which does not auto-print top-level values.
print(table(combined_classes$treat))

# Check if both 0 and 1 exist in the 'treat' column in each school. Exception:
# kindergarten classes outside Lisbon are randomised at the school level, so a
# school that only contributes such classes has a single value of 'treat'.
print(
  combined_classes %>%
    group_by(school) %>%
    summarise(has_treat_0 = any(treat == 0),
              has_treat_1 = any(treat == 1))
)

# Export as csv.
# write.csv() keeps its default row.names = TRUE here, i.e. the file starts
# with an unnamed column holding the row numbers.
write.csv(combined_classes, paste0(RAND_PATH, "randomisation_classes.csv"))


#---------------------------------------------------------------------------#
# End of file
#---------------------------------------------------------------------------#