/* cgmdyad_tmp.ado - modification from 5/7/2012 version of cgmdyad.ado, trying to
   implement new method that can deal with large data sets. basic idea:
     1. do 2-way robust
     2. go into mata, create a list of the "dyad only" cells in the middle of the
        VCV matrix. Add up over those cells, then pass this back to stata and
        add this to the 2-way robust matrix.
     3. do the eigenfix routine */
cap prog drop regdyad2

/* regdyad2: OLS with dyadic-cluster-robust standard errors.
   Syntax: regdyad2 depvar indepvars [if] [in] [weight], Dyads(id1 id2) [options]
   Options:
     dyads()    - required; the two variables identifying the two nodes of each
                  dyadic observation
     noeigenfix - if the raw variance matrix is non-PSD, post a missing variance
                  matrix instead of the eigenvalue-truncation fix
     nodof      - skip the finite-sample degrees-of-freedom correction when
                  reconstructing the final variance matrix
     jackresid  - use leave-one-node-out (jackknifed) predicted residuals
   Remaining options are passed through to -regress-.
   Posts: e(b), e(V) (dyad-robust), e(N), e(num_underlying), e(se_11_alt).
   Also leaves global matrices mytwoway, mytwoway_noDOF, rawvhat, mainvhat,
   mainvhat2, XpXinv, raw_dyad_middle, vhat_dyadpart, numclu in memory. */
program define regdyad2, eclass byable(onecall) sortpreserve
	syntax anything [if] [in] [aweight fweight iweight pweight /], Dyads(string) [NOEIGenfix NODOF JACKresid *]

	/* deal with weights */
	if "`weight'"~="" {
		local weight "[`weight'=`exp']"
	}
	else {
		local weight ""
	}

	/* main regression */
	qui regress `anything' `if' `in' `weight', `options'

	/* copy some information that regress provides */
	tempvar touse
	qui gen `touse' = e(sample)	/* note that this will take care of zero-weight cases */
	tempname b v_naive
	mat `b' = e(b)
	mat `v_naive' = e(V)
	local n = e(N)
	local N = e(N)
	local depname = e(depvar)
	local RHSvars_tmp : rowfullnames e(V)
	local RHSvars = regexr("`RHSvars_tmp'","_cons","")	/* remove "_cons" (first occurrence only) */
	local k = e(df_m) + 1

	/* generate the residuals */
	tempvar resid usedinreg groupvar
	qui predict double `resid' if `touse'==1, residual

	local dyad_id1 = word("`dyads'",1)
	local dyad_id2 = word("`dyads'",2)

	/* if "jackresid" option selected, jackknife over nodes to get predicted
	   residuals... then use these */
	if "`jackresid'" == "jackresid" {
		tempvar resid_jk1 resid_jk2 resid_avg
		tempfile analysisdata
		qui gen `resid_jk1' = .
		qui gen `resid_jk2' = .
		qui save `analysisdata'

		// make a list of nodes. To do this, string node lists along,
		// dyads1 then dyads2. Then eliminate duplicates.
		tempfile nodelist
		tempvar node_id
		stack `dyad_id1' `dyad_id2' , into(`node_id') clear
		contract `node_id'
		keep `node_id'
		sort `node_id'
		qui summ `node_id'
		local num_nodes = r(N)
		qui save `nodelist' , replace

		// load the node_ids into local macros up front, because the dataset
		// in memory changes repeatedly inside the loop below
		forvalues i = 1/`num_nodes' {
			local node_item_`i' = `node_id'[`i']
		}

		// loop through nodes list: re-estimate with node i excluded, then
		// store the out-of-sample predicted residual for node i's observations
		forvalues i = 1/`num_nodes' {
			qui use `analysisdata' , replace
			qui drop if `dyad_id1' == `node_item_`i'' | `dyad_id2' == `node_item_`i''	// the Jackknife step, drop node i.
			qui regress `anything' `if' `in' `weight', `options'
			tempvar resid_junk
			qui use `analysisdata' , replace
			qui predict double `resid_junk' , resid
			qui replace `resid_jk1' = `resid_junk' if `dyad_id1' == `node_item_`i''
			qui replace `resid_jk2' = `resid_junk' if `dyad_id2' == `node_item_`i''
			drop `resid_junk'
			qui save `analysisdata' , replace
			// list `resid' `resid_jk1' `resid_jk2'		// debug only; un-suppressed this floods the log once per node
		}	// end of i loop through nodes

		// each observation belongs to two nodes, so average its two jackknife residuals
		gen `resid_avg' = (`resid_jk1' + `resid_jk2') / 2
		// summ
		// list `resid' `resid_avg' `resid_jk1' `resid_jk2'
		qui replace `resid' = `resid_avg'
	}	// end of "if jackresid"

	/* cluster 3 times: way 1, way 2, way1-by-way2. Each time, take the VCV,
	   un-do the D.O.F. correction, and keep the matrix */

	/* create a new cluster variable which combines the two dyad variables; for combination */
	qui egen `groupvar' = group(`dyads') if `touse'
	*noi list `dyad_id1' `dyad_id2' `groupvar' in 1/30

	qui regress `anything' `if' `in' `weight', `options' cluster(`dyad_id1')
	/* obtain the VCV matrix, and Stata's M, N, and K: #clusters, #obs, #covars */
	tempname vcv_clu1 vcv_clu2 vcv_clu12
	matrix `vcv_clu1' = e(V)
	local N_clu1 = e(N)
	local M_clu1 = e(N_clust)
	local K_clu1 = e(df_m)

	qui regress `anything' `if' `in' `weight', `options' cluster(`dyad_id2')
	matrix `vcv_clu2' = e(V)
	local N_clu2 = e(N)
	local M_clu2 = e(N_clust)
	local K_clu2 = e(df_m)

	qui regress `anything' `if' `in' `weight', `options' cluster(`groupvar')
	matrix `vcv_clu12' = e(V)
	local N_clu12 = e(N)
	local M_clu12 = e(N_clust)
	local K_clu12 = e(df_m)

	preserve
	qui keep if `touse' == 1

	/* Step 4: call the mata function that creates the "cross-dyad" elements of the VCV matrix */
	noi mata: mataest("`b'","`resid'","`dyads'")	/* noi for debugging, qui for operation */
	local num_underlying = numclu[1,1]

	/* combine matavhat, and the 3 elements from the 3 one-way clustering to form Dyad-VCV matrix */
	/* the DOF correction is {M/(M-1)}*{(N-1)/N-K)}. So to undo the correction,
	   multiply by: {(M-1)/M}*{(N-K)/N-1)}. */
	tempname vhat rawvhat tmp1 tmp2 tmp12
	local dof1 = 1 / (( (`M_clu1'-1) / (`M_clu1') ) * ( (`N_clu1'-`K_clu1') / (`N_clu1'- 1) ))
	local dof2 = 1 / (( (`M_clu2'-1) / (`M_clu2') ) * ( (`N_clu2'-`K_clu2') / (`N_clu2'- 1) ))
	local dof12 = 1 / (( (`M_clu12'-1) / (`M_clu12') ) * ( (`N_clu12'-`K_clu12') / (`N_clu12'- 1) ))
	matrix `tmp1' = `vcv_clu1' * ( (`M_clu1'-1) / (`M_clu1') ) * ( (`N_clu1'-`K_clu1') / (`N_clu1'- 1) )
	matrix `tmp2' = `vcv_clu2' * ( (`M_clu2'-1) / (`M_clu2') ) * ( (`N_clu2'-`K_clu2') / (`N_clu2'- 1) )
	matrix `tmp12' = `vcv_clu12' * ( (`M_clu12'-1) / (`M_clu12') ) * ( (`N_clu12'-`K_clu12') / (`N_clu12'- 1) )
	matrix mytwoway = `vcv_clu1' + `vcv_clu2' - `vcv_clu12'
	matrix mytwoway_noDOF = `tmp1' + `tmp2' - `tmp12'
	matrix `rawvhat' = `tmp1' + `tmp2' - `tmp12' + vhat_dyadpart
	matrix rawvhat = `rawvhat'
	matrix rownames `rawvhat' = `RHSvars_tmp'
	matrix colnames `rawvhat' = `RHSvars_tmp'

	/* compute and multiply by degrees of freedom correction */
	/* 2014-12-23: num_underlying is total # groups ... instead think of that - 1
	   as the base ... so use (ngroups-1)/(ngroups-2) ... maybe revisit this
	   decision later */
	//dofcorrection = (numobs-1) * (numgroups-1) / (numobs-numRHSvars) / (numgroups-2)
	// NOTE(review): divides by (num_underlying - 2); with <= 2 underlying nodes this is
	// undefined/degenerate -- confirm intended behavior for tiny node counts.
	local dofcorrection = (`N'-1) * (`num_underlying' - 1) / (`N' - `k') / (`num_underlying' - 2)
	//noi di "DOF corrections for way-1, way-2, and way-12: `dof1'; `dof2'; `dof12'"
	//noi di "DOF correction for dyad: `dofcorrection'"
	matrix `vhat' = `rawvhat' * `dofcorrection'
	matrix mainvhat = `vhat'

	/* Do Eigenfix */
	*use mata to get eigenvalues after ensuring that variance matrix is (numerically) symmetric
	tempname eigenvalues eigenvectors
	mata {
		B = st_matrix("`rawvhat'")
		A = makesymmetric(B)
		symeigensystem(A, C=., lamda=.)
		st_matrix("`eigenvalues'", lamda)
		st_matrix("`eigenvectors'", C)
	}
	//matrix list `eigenvalues'
	local rnames : rownames `vhat'
	local numcols = colsof(`vhat')
	local eigenfix "no"
	forvalues col=1/`numcols' {	/* column number loop */
		if (`eigenvalues'[1,`col']<0) {
			mat `eigenvalues'[1,`col']=0
			local eigenfix "yes"
		}
	}	/* end column number loop */

	*now reconstruct variance matrix using spectral decomposition formula (e.g., Def A.16 in Greene, 6th)
	tempname raw_running_sum
	mat `raw_running_sum' = `vhat'	/* pre eigen-fix variance matrix */
	if "`nodof'" == "nodof" {
		local dofcorrection = 1
	}
	mat `vhat' = `eigenvectors'*diag(`eigenvalues')*`eigenvectors'' * `dofcorrection'
	mat rownames `vhat' = `rnames'
	mat colnames `vhat' = `rnames'
	if "`noeigenfix'"=="noeigenfix" & "`eigenfix'" == "yes" {
		tempname missing
		matrix `missing' = J(rowsof(`vhat'),colsof(`vhat'),.)
		matrix `vhat' = `missing'
		di
		di " -> NOTE: Raw estimated variance matrix was non positive semi-definite."
		di
		di " Because you used the -noeigenfix- option, 'missing' variance matrix used."
		di
	}	/* end checking/fixing non-psd variance estimate */

	// delete these lines later
	// keep the 1,1 element of VCV matrix, for initial simulations
	local var_11_alt = `vhat'[1,1]
	local se_11_alt = sqrt(`var_11_alt')
	/* end checking/fixing non-psd variance estimate */

	/* final cleanup and post */
	di
	di _column(40) "Number of dyadic obs = `n'"
	di _column(40) "Number of underlying obs = `num_underlying'"
	if "`if'"~="" di _column(40) "If condition = `if'"
	if "`in'"~="" di _column(40) "In condition = `in'"
	if "`weight'"~="" di _column(40) "Weights are = `weight'"
	di
	matrix mainvhat2 = `vhat'
	ereturn post `b' `vhat', e(`touse') depname(`depname')	// e() is the minimal abbreviation of esample()
	ereturn local ratio_all_non0 `ratio_all_nonzero'	// NOTE(review): `ratio_all_nonzero' is never set in this file, so this posts empty -- confirm intended
	ereturn local num_underlying `num_underlying'
	ereturn local se_11_alt `se_11_alt'
	ereturn scalar N = `n'
	ereturn local cmd = "regdyad"
	ereturn display
	restore
end

mata
// Step 5.0. Create list of observations to sum up, and sum them up.
// mataest: build the "cross-dyad" middle portion of the sandwich variance
// estimator and post it (plus helper matrices) back to Stata.
//   betapointer - name of the Stata coefficient vector (used only for its
//                 column stripe, i.e. the RHS variable names)
//   resid       - name of the residual variable in the dataset in memory
//   dyads       - two variable names identifying the dyad's two nodes
// Posts to Stata: XpXinv, raw_dyad_middle, vhat_dyadpart, numclu (1x1 = #nodes).
// Assumes the dataset in memory has already been restricted to the estimation
// sample and that the RHS variables exist under their e(b) column names.
void mataest(betapointer, resid, dyads)
{
	// 5.1. First read in data into matrices and scalars, create score matrix, etc.
	RHSvarnames = (st_matrixcolstripe(betapointer)[.,2])'
	st_view(re=., .,tokens(resid))
	st_view(X=., .,RHSvarnames[1,1..cols(RHSvarnames)-1])	// drop last name (_cons); constant added below
	st_view(idvars=., .,tokens(dyads))
	id1 = idvars[.,1]
	id2 = idvars[.,2]
	idlist = uniqrows(id1\id2)	// distinct node ids appearing in either position
	obsid = runningsum(J(rows(X),1,1))	// internal id for each dyadic observation
	// create score matrix
	X = (X,J(rows(X),1,1))	// add a constant
	expanded_resid = re * J(1,cols(X),1)	// multiply by e(1,k)
	Z = X :* expanded_resid	// Z is the score
	numgroups = rows(idlist)
	numobs = rows(Z)
	numRHSvars = cols(Z)
	middle = J(numRHSvars,numRHSvars,0)
	// 5.2 Create the list of observations to sum up. To do this, go through a
	// list of group ids. For each group id, collect all pairs of observations
	// that have that group in common, in the opposite-node location only,
	// and then add them to this list.
	i_g = 1
	while(i_g<=numgroups) {
		// identify which observations to pull out for running sum
		match_id1 = (id1 :== (J(numobs,1,1)*idlist[i_g]))	// idlist[i] gives the group id we are considering.
		match_id2 = (id2 :== (J(numobs,1,1)*idlist[i_g]))
		// exclude from consideration observations which have same id1 as id2. To implement:
		// match_id1 works, and match_id2 does not. (or vice versa for toselect2)
		toselect1 = match_id1 :* (J(numobs,1,1) - match_id2)
		toselect2 = match_id2 :* (J(numobs,1,1) - match_id1)
		group1_ids = select(obsid,toselect1)	// note: observation IDs, not country IDs
		group2_ids = select(obsid,toselect2)	// note: observation IDs, not country IDs
		// do a double sum over the elements in group1_ids and group2_ids, add
		// to middle part of VCV matrix.
		// NOTE(review): a pair of observations sharing BOTH nodes (e.g. (a,b)
		// and (b,a)) is accumulated once under each shared node -- confirm the
		// double counting is intended.
		i_1 = 1
		while(i_1 <= rows(group1_ids)) {
			index1 = group1_ids[i_1,1]
			i_2 = 1
			while(i_2 <= rows(group2_ids)) {
				index2 = group2_ids[i_2,1]
				// The list just gets one half of the VCV matrix. So add the opposite half too -- symmetric!
				middle = middle + Z[index1,.]' * Z[index2,.] + Z[index2,.]' * Z[index1,.]
				// the next line will list out all the elements being added to the matrix...
				//(index1,index2)
				i_2++
			}
			i_1++
		}
		i_g++
	}
	// 5.5. Create the VCV matrix.
	XpXinv = invsym(X'*X)
	//dofcorrection = (numobs-1) * numgroups / (numobs-numRHSvars) / (numgroups-1)
	vhat = XpXinv * middle * XpXinv
	vhat_symmetric = makesymmetric(vhat)	// NOTE(review): computed but the un-symmetrized vhat is what gets posted below -- confirm
	// 5.6. Post the results back to the main stata program.
	st_matrix("XpXinv",XpXinv)
	st_matrix("raw_dyad_middle",middle)
	st_matrix("vhat_dyadpart",vhat)
	st_matrix("numclu",numgroups)
}
end