/* cgmdyad_tmp.ado - modification from 5/7/2012 version of cgmdyad.ado, trying to
   implement new method that can deal with large data sets. basic idea:
     1. do 2-way robust
     2. go into mata, create a list of the "dyad only" cells in the middle of the
        VCV matrix. Add up over those cells, then pass this back to stata and
        add this to the 2-way robust matrix.
     3. do the eigenfix routine */
cap prog drop regdyad2

/* regdyad2: OLS with dyadic-cluster-robust standard errors.
   Syntax: regdyad2 depvar indepvars [if] [in] [weight], Dyads(id1 id2) [options]
   Options:
     dyads()    - required; the two variables identifying the two nodes of each
                  dyadic observation
     noeigenfix - if the raw variance matrix is non-PSD, post a missing variance
                  matrix instead of the eigenvalue-truncation fix
     nodof      - skip the finite-sample degrees-of-freedom correction when
                  reconstructing the final variance matrix
     jackresid  - use leave-one-node-out (jackknifed) predicted residuals
   Remaining options are passed through to -regress-.
   Posts: e(b), e(V) (dyad-robust), e(N), e(num_underlying), e(se_11_alt).
   Also leaves global matrices mytwoway, mytwoway_noDOF, rawvhat, mainvhat,
   mainvhat2, XpXinv, raw_dyad_middle, vhat_dyadpart, numclu in memory. */
program define regdyad2, eclass byable(onecall) sortpreserve
	syntax anything [if] [in] [aweight fweight iweight pweight /], Dyads(string) [NOEIGenfix NODOF JACKresid *]

	/* deal with weights */
	if "`weight'"~="" {
		local weight "[`weight'=`exp']"
	}
	else {
		local weight ""
	}

	/* main regression */
	qui regress `anything' `if' `in' `weight', `options'

	/* copy some information that regress provides */
	tempvar touse
	qui gen `touse' = e(sample)	/* note that this will take care of zero-weight cases */
	tempname b v_naive
	mat `b' = e(b)
	mat `v_naive' = e(V)
	local n = e(N)
	local N = e(N)
	local depname = e(depvar)
	local RHSvars_tmp : rowfullnames e(V)
	local RHSvars = regexr("`RHSvars_tmp'","_cons","")	/* remove "_cons" (first occurrence only) */
	local k = e(df_m) + 1

	/* generate the residuals */
	tempvar resid usedinreg groupvar
	qui predict double `resid' if `touse'==1, residual

	local dyad_id1 = word("`dyads'",1)
	local dyad_id2 = word("`dyads'",2)

	/* if "jackresid" option selected, jackknife over nodes to get predicted
	   residuals... then use these */
	if "`jackresid'" == "jackresid" {
		tempvar resid_jk1 resid_jk2 resid_avg
		tempfile analysisdata
		qui gen `resid_jk1' = .
		qui gen `resid_jk2' = .
		qui save `analysisdata'

		// make a list of nodes. To do this, string node lists along,
		// dyads1 then dyads2. Then eliminate duplicates.
		tempfile nodelist
		tempvar node_id
		stack `dyad_id1' `dyad_id2' , into(`node_id') clear
		contract `node_id'
		keep `node_id'
		sort `node_id'
		qui summ `node_id'
		local num_nodes = r(N)
		qui save `nodelist' , replace

		// load the node_ids into local macros up front, because the dataset
		// in memory changes repeatedly inside the loop below
		forvalues i = 1/`num_nodes' {
			local node_item_`i' = `node_id'[`i']
		}

		// loop through nodes list: re-estimate with node i excluded, then
		// store the out-of-sample predicted residual for node i's observations
		forvalues i = 1/`num_nodes' {
			qui use `analysisdata' , replace
			qui drop if `dyad_id1' == `node_item_`i'' | `dyad_id2' == `node_item_`i''	// the Jackknife step, drop node i.
			qui regress `anything' `if' `in' `weight', `options'
			tempvar resid_junk
			qui use `analysisdata' , replace
			qui predict double `resid_junk' , resid
			qui replace `resid_jk1' = `resid_junk' if `dyad_id1' == `node_item_`i''
			qui replace `resid_jk2' = `resid_junk' if `dyad_id2' == `node_item_`i''
			drop `resid_junk'
			qui save `analysisdata' , replace
			// list `resid' `resid_jk1' `resid_jk2'		// debug only; un-suppressed this floods the log once per node
		}	// end of i loop through nodes

		// each observation belongs to two nodes, so average its two jackknife residuals
		gen `resid_avg' = (`resid_jk1' + `resid_jk2') / 2
		// summ
		// list `resid' `resid_avg' `resid_jk1' `resid_jk2'
		qui replace `resid' = `resid_avg'
	}	// end of "if jackresid"

	/* cluster 3 times: way 1, way 2, way1-by-way2. Each time, take the VCV,
	   un-do the D.O.F. correction, and keep the matrix */

	/* create a new cluster variable which combines the two dyad variables; for combination */
	qui egen `groupvar' = group(`dyads') if `touse'
	*noi list `dyad_id1' `dyad_id2' `groupvar' in 1/30

	qui regress `anything' `if' `in' `weight', `options' cluster(`dyad_id1')
	/* obtain the VCV matrix, and Stata's M, N, and K: #clusters, #obs, #covars */
	tempname vcv_clu1 vcv_clu2 vcv_clu12
	matrix `vcv_clu1' = e(V)
	local N_clu1 = e(N)
	local M_clu1 = e(N_clust)
	local K_clu1 = e(df_m)

	qui regress `anything' `if' `in' `weight', `options' cluster(`dyad_id2')
	matrix `vcv_clu2' = e(V)
	local N_clu2 = e(N)
	local M_clu2 = e(N_clust)
	local K_clu2 = e(df_m)

	qui regress `anything' `if' `in' `weight', `options' cluster(`groupvar')
	matrix `vcv_clu12' = e(V)
	local N_clu12 = e(N)
	local M_clu12 = e(N_clust)
	local K_clu12 = e(df_m)

	preserve
	qui keep if `touse' == 1

	/* Step 4: call the mata function that creates the "cross-dyad" elements of the VCV matrix */
	noi mata: mataest("`b'","`resid'","`dyads'")	/* noi for debugging, qui for operation */
	local num_underlying = numclu[1,1]

	/* combine matavhat, and the 3 elements from the 3 one-way clustering to form Dyad-VCV matrix */
	/* the DOF correction is {M/(M-1)}*{(N-1)/N-K)}. So to undo the correction,
	   multiply by: {(M-1)/M}*{(N-K)/N-1)}. */
	tempname vhat rawvhat tmp1 tmp2 tmp12
	local dof1 = 1 / (( (`M_clu1'-1) / (`M_clu1') ) * ( (`N_clu1'-`K_clu1') / (`N_clu1'- 1) ))
	local dof2 = 1 / (( (`M_clu2'-1) / (`M_clu2') ) * ( (`N_clu2'-`K_clu2') / (`N_clu2'- 1) ))
	local dof12 = 1 / (( (`M_clu12'-1) / (`M_clu12') ) * ( (`N_clu12'-`K_clu12') / (`N_clu12'- 1) ))
	matrix `tmp1' = `vcv_clu1' * ( (`M_clu1'-1) / (`M_clu1') ) * ( (`N_clu1'-`K_clu1') / (`N_clu1'- 1) )
	matrix `tmp2' = `vcv_clu2' * ( (`M_clu2'-1) / (`M_clu2') ) * ( (`N_clu2'-`K_clu2') / (`N_clu2'- 1) )
	matrix `tmp12' = `vcv_clu12' * ( (`M_clu12'-1) / (`M_clu12') ) * ( (`N_clu12'-`K_clu12') / (`N_clu12'- 1) )
	matrix mytwoway = `vcv_clu1' + `vcv_clu2' - `vcv_clu12'
	matrix mytwoway_noDOF = `tmp1' + `tmp2' - `tmp12'
	matrix `rawvhat' = `tmp1' + `tmp2' - `tmp12' + vhat_dyadpart
	matrix rawvhat = `rawvhat'
	matrix rownames `rawvhat' = `RHSvars_tmp'
	matrix colnames `rawvhat' = `RHSvars_tmp'

	/* compute and multiply by degrees of freedom correction */
	/* 2014-12-23: num_underlying is total # groups ... instead think of that - 1
	   as the base ... so use (ngroups-1)/(ngroups-2) ... maybe revisit this
	   decision later */
	//dofcorrection = (numobs-1) * (numgroups-1) / (numobs-numRHSvars) / (numgroups-2)
	// NOTE(review): divides by (num_underlying - 2); with <= 2 underlying nodes this is
	// undefined/degenerate -- confirm intended behavior for tiny node counts.
	local dofcorrection = (`N'-1) * (`num_underlying' - 1) / (`N' - `k') / (`num_underlying' - 2)
	//noi di "DOF corrections for way-1, way-2, and way-12: `dof1'; `dof2'; `dof12'"
	//noi di "DOF correction for dyad: `dofcorrection'"
	matrix `vhat' = `rawvhat' * `dofcorrection'
	matrix mainvhat = `vhat'

	/* Do Eigenfix */
	*use mata to get eigenvalues after ensuring that variance matrix is (numerically) symmetric
	tempname eigenvalues eigenvectors
	mata {
		B = st_matrix("`rawvhat'")
		A = makesymmetric(B)
		symeigensystem(A, C=., lamda=.)
		st_matrix("`eigenvalues'", lamda)
		st_matrix("`eigenvectors'", C)
	}
	//matrix list `eigenvalues'
	local rnames : rownames `vhat'
	local numcols = colsof(`vhat')
	local eigenfix "no"
	forvalues col=1/`numcols' {	/* column number loop */
		if (`eigenvalues'[1,`col']<0) {
			mat `eigenvalues'[1,`col']=0
			local eigenfix "yes"
		}
	}	/* end column number loop */

	*now reconstruct variance matrix using spectral decomposition formula (e.g., Def A.16 in Greene, 6th)
	tempname raw_running_sum
	mat `raw_running_sum' = `vhat'	/* pre eigen-fix variance matrix */
	if "`nodof'" == "nodof" {
		local dofcorrection = 1
	}
	mat `vhat' = `eigenvectors'*diag(`eigenvalues')*`eigenvectors'' * `dofcorrection'
	mat rownames `vhat' = `rnames'
	mat colnames `vhat' = `rnames'
	if "`noeigenfix'"=="noeigenfix" & "`eigenfix'" == "yes" {
		tempname missing
		matrix `missing' = J(rowsof(`vhat'),colsof(`vhat'),.)
		matrix `vhat' = `missing'
		di
		di " -> NOTE: Raw estimated variance matrix was non positive semi-definite."
		di
		di " Because you used the -noeigenfix- option, 'missing' variance matrix used."
		di
	}	/* end checking/fixing non-psd variance estimate */

	// delete these lines later
	// keep the 1,1 element of VCV matrix, for initial simulations
	local var_11_alt = `vhat'[1,1]
	local se_11_alt = sqrt(`var_11_alt')
	/* end checking/fixing non-psd variance estimate */

	/* final cleanup and post */
	di
	di _column(40) "Number of dyadic obs = `n'"
	di _column(40) "Number of underlying obs = `num_underlying'"
	if "`if'"~="" di _column(40) "If condition = `if'"
	if "`in'"~="" di _column(40) "In condition = `in'"
	if "`weight'"~="" di _column(40) "Weights are = `weight'"
	di
	matrix mainvhat2 = `vhat'
	ereturn post `b' `vhat', e(`touse') depname(`depname')	// e() is the minimal abbreviation of esample()
	ereturn local ratio_all_non0 `ratio_all_nonzero'	// NOTE(review): `ratio_all_nonzero' is never set in this file, so this posts empty -- confirm intended
	ereturn local num_underlying `num_underlying'
	ereturn local se_11_alt `se_11_alt'
	ereturn scalar N = `n'
	ereturn local cmd = "regdyad"
	ereturn display
	restore
end

mata
// Step 5.0. Create list of observations to sum up, and sum them up.
// mataest: build the "cross-dyad" middle portion of the sandwich variance
// estimator and post it (plus helper matrices) back to Stata.
//   betapointer - name of the Stata coefficient vector (used only for its
//                 column stripe, i.e. the RHS variable names)
//   resid       - name of the residual variable in the dataset in memory
//   dyads       - two variable names identifying the dyad's two nodes
// Posts to Stata: XpXinv, raw_dyad_middle, vhat_dyadpart, numclu (1x1 = #nodes).
// Assumes the dataset in memory has already been restricted to the estimation
// sample and that the RHS variables exist under their e(b) column names.
void mataest(betapointer, resid, dyads)
{
	// 5.1. First read in data into matrices and scalars, create score matrix, etc.
	RHSvarnames = (st_matrixcolstripe(betapointer)[.,2])'
	st_view(re=., .,tokens(resid))
	st_view(X=., .,RHSvarnames[1,1..cols(RHSvarnames)-1])	// drop last name (_cons); constant added below
	st_view(idvars=., .,tokens(dyads))
	id1 = idvars[.,1]
	id2 = idvars[.,2]
	idlist = uniqrows(id1\id2)	// distinct node ids appearing in either position
	obsid = runningsum(J(rows(X),1,1))	// internal id for each dyadic observation
	// create score matrix
	X = (X,J(rows(X),1,1))	// add a constant
	expanded_resid = re * J(1,cols(X),1)	// multiply by e(1,k)
	Z = X :* expanded_resid	// Z is the score
	numgroups = rows(idlist)
	numobs = rows(Z)
	numRHSvars = cols(Z)
	middle = J(numRHSvars,numRHSvars,0)
	// 5.2 Create the list of observations to sum up. To do this, go through a
	// list of group ids. For each group id, collect all pairs of observations
	// that have that group in common, in the opposite-node location only,
	// and then add them to this list.
	i_g = 1
	while(i_g<=numgroups) {
		// identify which observations to pull out for running sum
		match_id1 = (id1 :== (J(numobs,1,1)*idlist[i_g]))	// idlist[i] gives the group id we are considering.
		match_id2 = (id2 :== (J(numobs,1,1)*idlist[i_g]))
		// exclude from consideration observations which have same id1 as id2. To implement:
		// match_id1 works, and match_id2 does not. (or vice versa for toselect2)
		toselect1 = match_id1 :* (J(numobs,1,1) - match_id2)
		toselect2 = match_id2 :* (J(numobs,1,1) - match_id1)
		group1_ids = select(obsid,toselect1)	// note: observation IDs, not country IDs
		group2_ids = select(obsid,toselect2)	// note: observation IDs, not country IDs
		// do a double sum over the elements in group1_ids and group2_ids, add
		// to middle part of VCV matrix.
		// NOTE(review): a pair of observations sharing BOTH nodes (e.g. (a,b)
		// and (b,a)) is accumulated once under each shared node -- confirm the
		// double counting is intended.
		i_1 = 1
		while(i_1 <= rows(group1_ids)) {
			index1 = group1_ids[i_1,1]
			i_2 = 1
			while(i_2 <= rows(group2_ids)) {
				index2 = group2_ids[i_2,1]
				// The list just gets one half of the VCV matrix. So add the opposite half too -- symmetric!
				middle = middle + Z[index1,.]' * Z[index2,.] + Z[index2,.]' * Z[index1,.]
				// the next line will list out all the elements being added to the matrix...
				//(index1,index2)
				i_2++
			}
			i_1++
		}
		i_g++
	}
	// 5.5. Create the VCV matrix.
	XpXinv = invsym(X'*X)
	//dofcorrection = (numobs-1) * numgroups / (numobs-numRHSvars) / (numgroups-1)
	vhat = XpXinv * middle * XpXinv
	vhat_symmetric = makesymmetric(vhat)	// NOTE(review): computed but the un-symmetrized vhat is what gets posted below -- confirm
	// 5.6. Post the results back to the main stata program.
	st_matrix("XpXinv",XpXinv)
	st_matrix("raw_dyad_middle",middle)
	st_matrix("vhat_dyadpart",vhat)
	st_matrix("numclu",numgroups)
}
end