/*==============================================================================

	This template gives instructions to calculate equivalence between a 
	randomized control group and participant customers in a sublap

	This code can be run in Stata as a .do file, or can be adapted to 
	be used in another language to perform equivalency checks. 
	
	Update: 9/14/2018

==============================================================================*/
	
*1. Identify a control pool of at least 150 customers to be selected 
*	via statistical matching or randomly withheld from the participant 
*	population. Create a dataset that has control and participant’s hourly 
*	usage by date from hours ending 1 through 24.

	/* The data should be in the format:
		Participant_ID | Date | kWh1 | kWh2 | kWh3 | ... | kWh23 | kWh24
	*/
	
	* Validation must occur on at least the 20 most recent days out 
	* of the last 75.
	
	* Randomly sample a control group from the population (or create a control
	* variable from another process)
	set seed 11235813	// ensure reproducibility by setting randomization seed
	gsample 150, wor cluster(participant_id) gen(control)
	gen treat = !control
	drop control	
	
*2.	Average the hourly load profile for all treatment group customers and all 
*	control group customers by day and hour. 
	collapse (mean) kwh*, by(date event treat)	

*3.	Flag and remove ineligible days. Ineligible days are event days that the
*	customers in the resource could have participated in, and weekends if 
*	the resource is only dispatched on weekdays. Also remove any days outside 
*	the validation period.	
	local val_date = mdy(8,1,2015)		/// set this validation date (EXAMPLE)
	local val_start = `val_date' - 75
	local val_end = `val_date' - 31
	
	
	gen ineligible = 	inlist(dow(date), 0, 6) | event == 1 | 	/// OR
						!inrange(date, `val_start', `val_end')
	drop if ineligible
	drop ineligible event

*4.	Arrange the data in the appropriate format. 	
	reshape long kwh_, i(date treat) j(hour)	// data long by date/hour
	reshape wide kwh_, i(date hour) j(treat)	// data wide by treat/control
	
	rename kwh_0 kwh_control
	rename kwh_1 kwh_treat
	
	order date hour kwh_treat kwh_control	// data is unique here by date
											// and hour. 
	
*5. Keep only the hours of interest: 12pm-9pm. Data is in hour-ending
	keep if inrange(hour, 13, 21)
	
	
*6. Calculate summary statistics manually:
	*a. Calculate the precurser pieces of both b and cv_rmse
		* 	b = sumproduct(kwh_treat, kwh_control)/sum(kwh_control^2)
		gen b_num 	= kwh_treat * kwh_control
		gen b_denom = kwh_control^2
	
		* 	rmse = sqrt(average((kwh_treat - kwh_control)^2))
		gen err_sq = (kwh_treat - kwh_control)^2
	
	*b. Summarize (ie sum to get num and denom of b, and average error sq
	*	and treatment usage to get cv_rmse
		collapse (sum) b_num b_denom (mean) err_sq kwh_treat
	
	
	*c. Calculate the summary statistics	
		gen b 		= b_num/b_denom
		gen rmse 	= sqrt(err_sq)
		gen cvrmse 	= rmse/kwh_treat
		gen ci_90 	= invnormal(0.95) * cvrmse	// 2-tailed CI at 90%
		
		gen pass_b 		= inrange(b, 0.95, 1.05)
		gen pass_cvrmse = ci_90 <= 0.1