/* Power Calculations for: "A Randomized Controlled Trial on the Provision of Financial and Social Capital to Low-Income Families in the United States"
Authors of the paper: Ania Jaroszewicz (Harvard), Jon Jachimowicz (HBS), Oliver Hauser (Exeter), Nava Ashraf (LSE), Emily Bianchi (Emory), Stephan Meier (Columbia), Johannes Haushofer (Stockholm, Bonn, IFN, Busara)
Authors of this script: Ania Jaroszewicz (ajaroszewicz@hbs.edu), Jon Jachimowicz (jjachimowicz@hbs.edu), & Oliver Hauser (o.hauser@exeter.ac.uk), with help from Johannes Haushofer (johannes.haushofer@ne.su.se)
29 May 2021

Purpose: this file does power calculations for the trial. It incorporates the fact that we have unequal treatment arm Ns and multiple endpoints.*/


***********************Set up****************************
//Initialize: start from an empty session, close any log left open by an earlier run, and switch off output paging so the script runs without pausing
drop _all
clear all
capture log close
set more off
local workdir "[Directory]" //edit this if desired
//Quote the path so that a directory whose name contains spaces reaches cd as a single argument
cd "`workdir'"
log using "power_calc.log", replace

//If you do not have it, install clustersampsi. Type "findit clustersampsi" in the command line, then click the Stata journal link st0286_2 and "click here to install".


//Design assumptions used throughout the power calculations

//Sample size per arm, net of attrition
local n1 = 216 //each cash arm
local n0 = 534 //each non-cash arm

//Error rates. The 4 outcomes are financial, psychological, health, and family well-being; the significance level is split across them with a (conservative) Bonferroni correction for multiple hypothesis tests
local power = 0.90 //target power
local num_tests = 4
local alpha = 0.05/`num_tests'

//Correlation and clustering inputs
local base_correl = 0.1 //correlation between baseline and endline measurements; 0.1 is conservative
local rho = 0.05 //intraclass correlation; 0.05 is conservative
local size_cv = 0 //coefficient of variation of cluster size. Default is 0 (all clusters are equally sized)

//Build the skeleton of the results table: one row per pairwise treatment-arm comparison. The sections below add the variables needed to calculate the Minimal Detectable Effect (MDE) for each row.
set obs 6
gen comparison=""
local row = 0
foreach pair in "NTC vs C" "NTC vs S" "NTC vs CS" "C vs CS" "S vs CS" "C vs S" {
	local ++row
	replace comparison = "`pair'" in `row'
}

//Total N available for each pairwise comparison: the sum of the two arms being compared
gen n=.
replace n=round(`n0'+`n0') if comparison == "NTC vs S" //two non-cash arms
replace n=round(`n1'+`n1') if comparison == "C vs CS" //two cash arms
replace n=round(`n0'+`n1') if inlist(comparison, "S vs CS", "C vs S", "NTC vs C", "NTC vs CS") //one non-cash arm and one cash arm

//Work out the `group' (analysis unit) size. NTC vs C is easy (no one has a peergroup, so the group size is always 1) and so is S vs CS (everyone has a peergroup and we conservatively assume the largest size of 8). The comparisons in which one arm has peergroups and the other does not need more work: for each of them, build a scratch dataset with one row per participant holding that participant's group size, and store the mean and SD of that column in locals that are piped into the main data below.
	//Each entry is: suffix of the stored locals | N in first arm | group size in first arm | N in second arm | group size in second arm
	//  ntc_cs: NTC (n0; no peergroups, unit of analysis is 1) vs CS (n1; peergroups conservatively assumed to be 8 although the actual range will be 4-8)
	//  c_s:    S (n0; peergroups of 8) vs C (n1; no peergroups, unit of analysis is 1)
	//  ntc_s:  NTC (n0; unit of analysis is 1) vs S (n0; peergroups of 8)
	//  c_cs:   C (n1; unit of analysis is 1) vs CS (n1; peergroups of 8)
	foreach spec in "ntc_cs `n0' 1 `n1' 8" "c_s `n0' 8 `n1' 1" "ntc_s `n0' 1 `n0' 8" "c_cs `n1' 1 `n1' 8" {
		tokenize `spec'
		local tag `1'
		local n_first `2'
		local size_first `3'
		local n_second `4'
		local size_second `5'
		preserve
			clear
			local temp_n=round(`n_first'+`n_second')
			set obs `temp_n'
			//first n_first rows belong to the first arm, the remaining rows to the second arm
			gen num=cond(_n<=`n_first', `size_first', `size_second')
			qui summ num, detail
			local mixedpeergroupmean_`tag'=r(mean)
			local mixedpeergroupsd_`tag'=r(sd)
		restore
	}

//Average group size (peergroup or unit of analysis) for each pairwise comparison; this is passed to clustersampsi as the cluster size
	gen peergroupsize = .
	//No variation in group size: NTC and C participants come in on their own, without peergroups, so the unit of analysis is 1
	replace peergroupsize = 1 if comparison=="NTC vs C"
	//No variation in group size: take the most conservative estimate and assume that everyone ends up in a peergroup of size 8
	replace peergroupsize = 8 if comparison=="S vs CS"
	//One arm with peergroups and one without: use the mixed mean computed above, rounded up to a whole number. The comparison label is rebuilt from the local-name suffix (e.g. ntc_cs -> "NTC vs CS")
	foreach tag in ntc_cs c_s ntc_s c_cs {
		local label = subinstr(upper("`tag'"), "_", " vs ", .)
		replace peergroupsize = ceil(`mixedpeergroupmean_`tag'') if comparison == "`label'"
	}
	compress

//Number of peergroups or units of analysis in each pairwise comparison: total N divided by the average group size (also needed for the clustersampsi command)
gen peergroupnum = round(n/peergroupsize) 
tabstat peergroupsize, by(comparison)
tabstat peergroupnum, by(comparison)

//Coefficient of variation of group size (SD/mean) for each pairwise comparison (also needed for the clustersampsi command). It stays at 0 where there is no variation (NTC vs C and S vs CS). The comparison label is rebuilt from the local-name suffix (e.g. ntc_cs -> "NTC vs CS")
gen size_cv = 0
foreach tag in ntc_cs c_s ntc_s c_cs {
	local label = subinstr(upper("`tag'"), "_", " vs ", .)
	replace size_cv=`mixedpeergroupsd_`tag''/`mixedpeergroupmean_`tag'' if comparison == "`label'"
}

//Numeric id for each comparison, used to loop over the rows in Step #1
encode comparison, gen(comparison_num)

//Placeholder for the single-endline MDE that Step #1 fills in
gen mde=.


**********************Do the power calculations******************

/*To do the power calculations, we use 2 different power commands: 
a) clustersampsi is good because it allows for clustered RCTs. But, it does not allow for multiple endlines. It outputs MDE (minimum detectable effect sizes).
b) sampsi is good because it allows for multiple endlines. But, it does not allow for clustering. Also unfortunately, sampsi only outputs either N or power, not MDE. We know our target power and our Ns. So, we vary MDE to identify at what stage power hits the target level and save that value.

How we combine the two power commands: 
Step #1: We first use clustersampsi to calculate the MDEs when we have 1 endline and clusters. 
Step #2: We then use sampsi twice: first we run it using 1 endline, and then with 6 endlines (which is what we actually have in the trial; Months 3, 6, 9, 12, 15, and 18). Neither of these are run with clusters since it's not allowed
Step #3: We calculate the percent difference in MDEs for the two sampsi tests-- i.e., how much we gain in terms of MDE as a percent as we move from 1 endline measure to 6
Step #4: We then apply the percentage calculated in Step #3 back to the clustersampsi results from Step #1 to identify how much our MDEs improve when we take into account the clusters. 

Since our pairwise comparisons vary in N (due to the unequal treatment group sizes), we run Steps #2-#4 three times, once when we're comparing a non cash vs. cash arm; once when we're comparing two non cash arms; and once when we're comparing two cash arms.*/


//Step #1: Clustersampsi
//For each pairwise comparison, pass its group size, number of groups, and group-size CV to clustersampsi and store the detectable difference it returns in mde. Each comparison is handled on its own one-row copy of the data (preserve/keep), saved to temp_comp<c>.dta, and the six files are stacked back together afterwards.
	forvalues c = 1/6 {
		preserve
		keep if comparison_num==`c'
			local peergroupsize=peergroupsize[1]
			local peergroupnum=peergroupnum[1]
			local size_cv=size_cv[1]
			dis `peergroupsize'
			dis `peergroupnum'
			dis `size_cv'
			//NOTE(review): beta() is given the target power here, not a type II error rate -- confirm this matches clustersampsi's definition of beta()
			qui clustersampsi, detectabledifference alpha(`alpha') beta(`power') m(`peergroupsize') k(`peergroupnum') rho(`rho') size_cv(`size_cv') base_correl(`base_correl')
			replace mde=real(r(DD)) 
			//Stop here rather than carry a missing MDE silently into Steps #2-#4 and the exported table
			qui count if missing(mde)
			if r(N)>0 {
				dis as error "clustersampsi did not return a usable detectable difference for comparison number `c'"
				exit 498
			}
			save temp_comp`c'.dta, replace
		restore
	}
	clear
	use temp_comp1
	append using temp_comp2 temp_comp3 temp_comp4 temp_comp5 temp_comp6
	//This step models one endline measurement (the baseline enters only through base_correl), so the message refers to a single endline
	dis "If we just think about a single endline measurement, we are powered to see MDEs of:"
	tabstat mde, by(comparison)


//Step #2.1: sampsi when there is 1 endline measure with Ns corresponding to each non cash vs cash pairwise comparison: NTC vs C, S vs C, S vs CS, NTC vs CS.
//sampsi reports power rather than MDE, so scan candidate effect sizes 0.01, 0.02, ..., 0.50 (in SD units, because sd1 = sd2 = 1) and stop at the first one whose power reaches the target. The loop counter is an integer that is divided by 100, which keeps every candidate free of the rounding error that a fractional forvalues step can accumulate.
	local power_post1=0
	local end_correl=0.5 //passed to sampsi as r1()
	local mde_post1=0
	forvalues step = 1/50 {
		local i=`step'/100
		qui sampsi 0 `i', n1(`n0') n2(`n1') pre(1) post(1) r01(`base_correl') r1(`end_correl') sd1(1) sd2(1) alpha(`alpha')
		local power_post1=`r(power)' 
		local mde_post1=`i'
		if `power_post1'>=`power' {
			continue, break
		}
	}
	//If the scan ran out of candidates, the value kept is the largest one tried, not a true MDE
	if `power_post1'<`power' {
		dis as error "Warning (non-cash vs cash, 1 endline): target power of `power' was not reached for any effect size up to `mde_post1'; the true MDE is larger than the value reported."
	}
	dis "For NTC vs C, S vs C, S vs CS, NTC vs CS w/ 1 endline: At MDE of `mde_post1', power is `power_post1'."
	

//Step #2.2: sampsi when there are 6 endline measurements with Ns corresponding to each non cash vs cash pairwise comparison: NTC vs C, S vs C, S vs CS, NTC vs CS. 
	local postnum = 6  
	gen mde_post`postnum'=.
	local power_post`postnum'=0
	local mde_post`postnum'=0
	forvalues step = 1/50 {
		local i=`step'/100
		qui sampsi 0 `i', n1(`n0') n2(`n1') pre(1) post(`postnum') r01(`base_correl') r1(`end_correl') sd1(1) sd2(1) alpha(`alpha')
		local power_post`postnum'=`r(power)' 
		local mde_post`postnum'=`i'
		if `power_post`postnum''>=`power' {
			continue, break
		}
	}
	if `power_post`postnum''<`power' {
		dis as error "Warning (non-cash vs cash, `postnum' endlines): target power of `power' was not reached for any effect size up to `mde_post`postnum''; the true MDE is larger than the value reported."
	}
	dis "For NTC vs C, S vs C, S vs CS, NTC vs CS w/ `postnum' endlines: At MDE of `mde_post`postnum'', power is `power_post`postnum''."
	

//Step #3: Compute how MDE changed (as a ratio of the 1-endline MDE) as we increased the endline measurements in sampsi from 1 to 6.
	local mde_post1_vs_post`postnum' = `mde_post`postnum''/`mde_post1'
	dis "For NTC vs C, S vs C, S vs CS, NTC vs CS: The MDE drops from `mde_post1' to `mde_post`postnum'' as we go from 1 endline measurement to `postnum' endline measurements. This means it goes down to `mde_post1_vs_post`postnum'' of its original value."

//Step #4: Apply that ratio to the clustersampsi result for every comparison that pairs one non-cash arm with one cash arm
	replace mde_post`postnum'=mde*`mde_post1_vs_post`postnum'' if inlist(comparison, "NTC vs C", "NTC vs CS", "S vs CS", "C vs S")

	
//Now repeat this exercise for NTC vs S (two non-cash arms, so both sampsi groups have N = n0): 
//As before, scan effect sizes 0.01, 0.02, ..., 0.50 with an integer counter divided by 100 (avoids rounding error from a fractional forvalues step) and stop at the first one that reaches the target power.
	local power_post1=0
	local mde_post1=0
	forvalues step = 1/50 {
		local i=`step'/100
		qui sampsi 0 `i', n1(`n0') n2(`n0') pre(1) post(1) r01(`base_correl') r1(`end_correl') sd1(1) sd2(1) alpha(`alpha')
		local power_post1=`r(power)' 
		local mde_post1=`i'
		if `power_post1'>=`power' {
			continue, break
		}
	}
	//If the scan ran out of candidates, the value kept is the largest one tried, not a true MDE
	if `power_post1'<`power' {
		dis as error "Warning (NTC vs S, 1 endline): target power of `power' was not reached for any effect size up to `mde_post1'; the true MDE is larger than the value reported."
	}
	dis "For NTC vs S w/ 1 endline: At MDE of `mde_post1', power is `power_post1'."
	local power_post`postnum'=0
	local mde_post`postnum'=0
	forvalues step = 1/50 {
		local i=`step'/100
		qui sampsi 0 `i', n1(`n0') n2(`n0') pre(1) post(`postnum') r01(`base_correl') r1(`end_correl') sd1(1) sd2(1) alpha(`alpha')
		local power_post`postnum'=`r(power)' 
		local mde_post`postnum'=`i'
		if `power_post`postnum''>=`power' {
			continue, break
		}
	}
	if `power_post`postnum''<`power' {
		dis as error "Warning (NTC vs S, `postnum' endlines): target power of `power' was not reached for any effect size up to `mde_post`postnum''; the true MDE is larger than the value reported."
	}
	dis "For NTC vs S w/ `postnum' endlines: At MDE of `mde_post`postnum'', power is `power_post`postnum''."
	//Ratio of the multi-endline MDE to the 1-endline MDE, applied to the clustersampsi result for this comparison
	local mde_post1_vs_post`postnum' = `mde_post`postnum''/`mde_post1'
	dis "For NTC vs S: The MDE drops from `mde_post1' to `mde_post`postnum'' as we go from 1 endline measurement to `postnum' endline measurements. This means it goes down to `mde_post1_vs_post`postnum'' of its original value."
	replace mde_post`postnum' = mde*`mde_post1_vs_post`postnum'' if comparison=="NTC vs S"

//Now repeat this exercise for C vs CS (two cash arms, so both sampsi groups have N = n1): 
//As before, scan effect sizes 0.01, 0.02, ..., 0.50 with an integer counter divided by 100 (avoids rounding error from a fractional forvalues step) and stop at the first one that reaches the target power.
	local power_post1=0
	local mde_post1=0
	forvalues step = 1/50 {
		local i=`step'/100
		qui sampsi 0 `i', n1(`n1') n2(`n1') pre(1) post(1) r01(`base_correl') r1(`end_correl') sd1(1) sd2(1) alpha(`alpha')
		local power_post1=`r(power)' 
		local mde_post1=`i'
		if `power_post1'>=`power' {
			continue, break
		}
	}
	//If the scan ran out of candidates, the value kept is the largest one tried, not a true MDE
	if `power_post1'<`power' {
		dis as error "Warning (C vs CS, 1 endline): target power of `power' was not reached for any effect size up to `mde_post1'; the true MDE is larger than the value reported."
	}
	dis "For C vs CS w/ 1 endline: At MDE of `mde_post1', power is `power_post1'."
	local power_post`postnum'=0
	local mde_post`postnum'=0
	forvalues step = 1/50 {
		local i=`step'/100
		qui sampsi 0 `i', n1(`n1') n2(`n1') pre(1) post(`postnum') r01(`base_correl') r1(`end_correl') sd1(1) sd2(1) alpha(`alpha')
		local power_post`postnum'=`r(power)' 
		local mde_post`postnum'=`i'
		if `power_post`postnum''>=`power' {
			continue, break
		}
	}
	if `power_post`postnum''<`power' {
		dis as error "Warning (C vs CS, `postnum' endlines): target power of `power' was not reached for any effect size up to `mde_post`postnum''; the true MDE is larger than the value reported."
	}
	dis "For C vs CS w/ `postnum' endlines: At MDE of `mde_post`postnum'', power is `power_post`postnum''."
	//Ratio of the multi-endline MDE to the 1-endline MDE, applied to the clustersampsi result for this comparison
	local mde_post1_vs_post`postnum' = `mde_post`postnum''/`mde_post1'
	dis "For C vs CS: The MDE drops from `mde_post1' to `mde_post`postnum'' as we go from 1 endline measurement to `postnum' endline measurements. This means it goes down to `mde_post1_vs_post`postnum'' of its original value."
	replace mde_post`postnum' = mde*`mde_post1_vs_post`postnum'' if comparison=="C vs CS"


****************Summarize the results****************
	dis "In summary: For cash arms of N=`n1' and non-cash arms of N=`n0', if we take into account the fact that we have `postnum' endline measurements, we are powered to see MDEs of:"
	tabstat mde_post`postnum', by(comparison)


****************Closeout********************														
//Keep only the final (multi-endline) MDE for each comparison and export it. The column is referenced through `postnum' rather than a hard-coded 6, so this section follows whatever number of endlines was set in Step #2.2.
order comparison 
keep comparison mde_post`postnum'
rename mde_post`postnum' mde
export excel using "power_calculation_output.xlsx", replace firstrow(varlabels) keepcellfmt
//Remove only the intermediate temp_comp<c>.dta files written in Step #1. They were saved to the current directory, so list and erase them relative to it; the narrower pattern avoids deleting unrelated files whose names merely start with "temp", and quoting protects names that contain spaces.
local datafiles: dir "." files "temp_comp*.dta"
foreach datafile of local datafiles {
	erase "`datafile'"
}
log close