#Two-line test app 0.52 # Last update 2018 11 23 # Written by Uri Simonsohn (urisohn@gmail.com) # This is the exact code behind the two-line online app (http://webstimate.org/twolines/) # If you see any errors or have questions people contact me directly. # #################################################################################### # #WARNING: THIS WILL INSTALL PACKAGES IF YOU DON'T HAVE THEM list.of.packages <- c("mgcv", "stringr","sandwich","lmtest") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) rm(list=ls()) # clean it all library(mgcv) #This library has the additive model with smoothing function library(stringr) #To process strings in the function library(sandwich) #For robust standard errors library(lmtest) #To run linear tests #OUTLINE #Function 1 - rp(p) Reformat p-value for printing on chart #Function 2 - eval2() Reads a variable name and assigns the values to another variable (useful for processing formulas and creating temporary variables) #Function 3 - reg2() Interrupted regression with entered formula plus cutoff point #Function 4 - twolines() Run two-lines test, relying on functions 1 and 2 as well ################################################################################ #Function 1 - rp(p) Reformat p-value for printing on chart rp=function(p) if (p<.0001) return("p<.0001") else return(paste0("p=", sub("^(-?)0.", "\\1.", sprintf("%.4f", p)))) #Function 2 - eval2() evaluate a string as a function eval2=function(string) eval(parse(text=string)) #Function that evaluates a variable" #if x.f="x1", then x=eval2(x.f) will populate x with the values of x1 #Function 3 - two-line regression with glm() so that it has heteroskedastic robust standard errors reg2=function(f,xc,graph=1,family="gaussian") { #Syntax: #f: formula as in y~x1+x2+x3 #The first predictor is the one the u-shape is tested on #xc: where to set the breakpoint, a number #link: # Gaussian for OLS # binomial for probit #(1) Extract variable names from formula #1.1 Get the formulas y.f=all.vars(f)[1] #DV x.f=all.vars(f)[2] #Variable on which the u-shape shall be tested #Number of variables var.count=length(all.vars(f)) #How many total variables, including y and x #Entire model, except the first predictor if (var.count>2) nox.f=drop.terms(terms(f),dropx=1,keep.response = T) #1.2 Grab the two key variables to be used, xu and yu xu=eval2(x.f) #xu is the key predictor predicted to be u-shaped yu=eval2(y.f) #yu is the dv #1.3 Replace formula for key predictor so that it accommodates possibly discrete values, gam() breaks down if x has few possible values unless one restricts it, done automatically #1.3.1 Count number of unique x values unique.x=length(unique(xu)) #How many unique values x has #1.3.2 New function segment for x sx.f=paste0("s(",x.f,",bs='cr', k=min(10,",unique.x,"))" ) #1.4 xc is included in the first line xlow1 =ifelse(xu<=xc,xu-xc,0) #xlow=x-xc when x<xc, 0 otherwise xhigh1=ifelse(xu>xc,xu-xc,0) #xhigh=x when x<xmax, 0 otherwise high1 =ifelse(xu>xc,1,0) #high dummy, allows interruption #1.5 Now include xc in second line xlow2=ifelse(xu<xc,xu-xc,0) xhigh2=ifelse(xu>=xc,xu-xc,0) high2=ifelse(xu>=xc,1,0) #(2) Run interrupted regressions #2.1 Generate formulas replacing the single predictor x, with the 3 new variables #If there were covariates, grab them an copy-paste them after the 3 new variables if (var.count>2) { glm1.f=update(nox.f,~ xlow1+xhigh1+high1+.) #update takes a formula and adds elements to it, by putting the . at the end, it will paste the existing variables after the new 3 variables glm2.f=update(nox.f,~ xlow2+xhigh2+high2+.) } #If there were no covariates, just run the 3 variable model if (var.count==2) { glm1.f=as.formula("yu~ xlow1+xhigh1+high1") glm2.f=as.formula("yu~ xlow2+xhigh2+high2") } #2.2 Run them glm1=glm(as.formula(format(glm1.f)),family=family) glm2=glm(as.formula(format(glm2.f)),family=family) #2.3 Compute robust standard errors rob1=coeftest(glm1, vcov=vcovHC(glm1,"HC3")) #Until 2018 03 20 I was using HC3, but they sometimes generate errors, so i switched it to HC1 rob2=coeftest(glm2, vcov=vcovHC(glm2,"HC3")) #Sometimes HC3 gives NA values (for very sparse or extreme data), check and if that's the case change method msg="" if (is.na(rob1[2,4])) { rob1=coeftest(glm1, vcov=vcovHC(glm1,"HC1")) msg=paste0(msg,"\nFor line 1 the heteroskedastic standard errors HC3 resulted in an error thus we used HC1 instead.") } if (is.na(rob2[2,4])) { rob2=coeftest(glm2, vcov=vcovHC(glm2,"HC1")) msg=paste0(msg,"\nFor line 2 the heteroskedastic standard errors HC3 resulted in an error thus we used HC1 instead.") } #2.4 Slopes b1=as.numeric(rob1[2,1]) b2=as.numeric(rob2[3,1]) #2.5 Test statistics, z-values z1=as.numeric(rob1[2,3]) z2=as.numeric(rob2[3,3]) #2.6 p-values p1=as.numeric(rob1[2,4]) p2=as.numeric(rob2[3,4]) #3) Is the u-shape significant? u.sig =ifelse(b1*b2<0 & p1<.05 & p2<.05,1,0) #4) Plot results if (graph==1) { #4.1 General colors and parameters pch.dot=1 #Dot for scatterplot (data) #col.l1='blue2' #Color of straight line 1 col.l1='dodgerblue3' col.l2='firebrick' #Color of straight line 2 col.fit='gray50' #Color of fitted smooth line col.div="green3" #Color of vertical line lty.l1=1 #Type of line 1 lty.l2=1 #Type of line 2 lty.fit=2 #Type of smoothed line #4.2) Estimate smoother if (var.count>2) { gam.f=paste0(format(nox.f),"+",sx.f) #add the modified smoother version of x into the formula gams=gam(as.formula(gam.f),link=link) #now actually run the smoother } if (var.count==2) { gams=gam(as.formula(paste0("yu~",sx.f)),link=link) #now actually run the smoother } #Get dots of raw data #4.3) If no covariates, there are two variables, and y.dots is the y values if (var.count==2) yobs=yu #4.4) If covariates present, yobs is the fitted value with u(x) at mean, need new.data() with variables at means if (var.count>2) { #4.4.1) Put observed data into matrix data.obs=as.data.frame(matrix(nrow=length(xu),ncol=var.count)) colnames(data.obs)=all.vars(f) #4.4.2 Populate the dataset with the observed variables for (i in 1:(var.count)) data.obs[,i]=eval(as.name(all.vars(f)[i])) #4.4.3) Drop observations with missing values on any of the variables data.obs=na.omit(data.obs) #4.4.4) Create data where u(x) is at sample means to get residuals based on rest of models to act as yobs #Recall: columns 1 & 2 have y and u(x) in obs.data data.xufixed =data.obs data.xufixed[,2]=mean(data.obs[,2]) #Note, the 1st predictor, 2nd columns, is always the one hypothesized to be u-shaped #replace it with the mean value of the predictor #5.4.5) Create data where u(x) is obs, and all else at sample means data.otherfixed = data.obs #start with original value #Replace all RHS with mean, except the u(x) #for (i in 3:var.count) data.otherfixed[,i]=mean(data.obs[,i]) #changed on 2018 11 23 to allow having factors() as predictors, their "midpoint" value is used, sometimes that value is more meaningfully a median point than others for (i in 3:var.count) { #loop over covariates xt=sort(data.obs[,i]) #create auxiliary variable that has those values sorted n=length(xt) #see how many observations there are xm=xt[round(n/2,0)] #take the midpoint value (this will work with ordinal and factor data, but with factor it can be arbitrary) data.otherfixed[,i]=xm #Replace with that value in the dataset used for fitted value } #4.4.6) Get yobs with covariates #First the fitted value yhat.xufixed=predict.gam(gams,newdata = data.xufixed) #Substract fitted value from observed y, and shift it with constant so that it has same mean as original y yobs =yu-yhat.xufixed yobs=yobs+mean(yu)-mean(yobs) #Adjust to have the same mean } #End if for covariates that requires computes y.obs instead of using real y. #4.5) First line (x,y) coordinates # offset1=mean(yobs[xu<=xc])-min(xu)*b1-(xc-min(xu))/2*b1 # x.l1=c(min(xu),xc) # y.l1=c(min(xu)*b1+offset1,xc*b1+offset1) # # #4.6) First line (x,y) coordinates # offset2=mean(yobs[xu>=xc])-xc*b2-(max(xu)-xc)/2*b2 # x.l2=c(xc,max(xu)) # y.l2=c(xc*b2+offset2,max(xu)*b2+offset2) #4.7) Get yhat.smooth #Without covariates, just fit the observed data if (var.count==2) yhat.smooth=predict.gam(gams) #With covariates, fit at observed means if (var.count>2) yhat.smooth=predict.gam(gams,newdata = data.otherfixed) #Substract fitted value from observed y offset3 = mean(yobs-yhat.smooth) yhat.smooth=yhat.smooth+offset3 #4.8) Coordinates for top and bottom end of chart y1 =max(yobs,yhat.smooth) #highest point y0 =min(yobs,yhat.smooth) #lowest point yr =y1-y0 #range y0 =y0-.3*yr #new lowest. 30% lower #xs x1 =max(xu) x0 =min(xu) xr =x1-x0 #4.9) Plot #4.9.1) Figure out coordinates for arrows so that they fit par(mar=c(5.4,4.1,.5,2.1)) plot(xu[xu<xc],yobs[xu<xc],cex=.75,col=col.l1,pch=pch.dot,las=1, ylim=c(y0,y1), xlim=c(min(xu),max(xu)), xlab="", ylab="") #Range of y has extra 30% to add labels points(xu[xu>xc],yobs[xu>xc],cex=.75, col=col.l2) #Axis labels mtext(side=1,line=2.75,x.f,font=2) mtext(side=2,line=2.75,y.f,font=2) #4.10) Smoothed line #lines(xu[order(xu)],yhat.smooth[order(xu)],col=col.fit,lwd=2,lty=lty.fit) #4.10 - New 2018 05 25 lines(xu[order(xu)],yhat.smooth[order(xu)],col=col.fit,lty=2,lwd=2) #4.10.2 Arrow 1 xm1=(xc+x0)/2 x0.arrow.1=xm1-.1*xr x1.arrow.1=xm1+.1*xr y0.arrow.1=y0+.1*yr y1.arrow.1=y0+.1*yr+ b1*(x1.arrow.1-x0.arrow.1) #Move arrow if it is too short if (x0.arrow.1<x0+.1*xr) x0.arrow.1=x0 #Move arrow if it covers text gap.1=(min(y0.arrow.1,y1.arrow.1)-(y0+.1*yr)) if (gap.1<0) { y0.arrow.1=y0.arrow.1-gap.1 y1.arrow.1=y1.arrow.1-gap.1 } arrows(x0=x0.arrow.1,x1=x1.arrow.1,y0=y0.arrow.1,y1=y1.arrow.1,col=col.l1,lwd=2) #4.10.3 Text under arrow 1 xm1=max(xm1,x0+.20*xr) text(xm1,y0+.025*yr, paste0("Average slope 1:\nb = ",round(b1,2),", z = ",round(z1,2),", ",rp(p1)),col=col.l1) #4.10.3 Arrow 2 x0.arrow.2=xc+(x1-xc)/2-.1*xr x1.arrow.2=xc+(x1-xc)/2+.1*xr y0.arrow.2=y1.arrow.1 y1.arrow.2=y0.arrow.2 + b2*(x1.arrow.2-x0.arrow.2) gap.2=(min(y0.arrow.2,y1.arrow.2)-(y0+.1*yr)) if (gap.2<0) { y0.arrow.2=y0.arrow.2-gap.2 y1.arrow.2=y1.arrow.2-gap.2 } #Shorten arrow if it is too close to the end x1.arrow.2=min(x1.arrow.2,x1) if (x0.arrow.2<xc) x0.arrow.2=xc xm2=xc+(x1-xc)/2 xm2=min(xm2,x1-.2*xr) arrows(x0=x0.arrow.2,x1=x1.arrow.2,y0=y0.arrow.2,y1=y1.arrow.2,col=col.l2,lwd=2) text(xm2,y0+.025*yr, paste0("Average slope 2:\nb = ",round(b2,2),", z = ",round(z2,2),", ",rp(p2)),col=col.l2) #4.13 Division line lines(c(xc,xc),c(y0+.35*yr,y1),col=col.div,lty=lty.fit) text(xc,y0+.3*yr,round(xc,2),col=col.div) }#End: if graph==1 #5 list with results res=list(b1=b1,p1=p1,b2=b2,p2=p2,u.sig=u.sig,xc=xc,z1=z1,z2=z2, glm1=glm1,glm2=glm2,rob1=rob1,rob2=rob2,msg=msg) #Output list with all those parameters, betas, z-values, p-values and significance for u if (graph==1) res$yhat.smooth=yhat.smooth #output it res } #End of reg2() function #Function 4- twolines=function(f,graph=1,link="gaussian",data=NULL,pngfile="") { attach(data) #(1) Extract variable names #1.1 Get the formulas y.f=all.vars(f)[1] #DV x.f=all.vars(f)[2] #Variable on which the u-shape shall be tested #Number of variables var.count=length(all.vars(f)) #How many predictors in addition to the key predictor? #Entire model, except the first predictor if (var.count>2) nox.f=drop.terms(terms(f),dropx=1,keep.response = T) #1.1.5 Drop missing value for any the variables being used #all variables in the regression vars=all.vars(f) #Vector with columns associated with those variable names inthe uploaded dataset cols=c() for (var in vars) cols=c(cols, which(names(data)==var)) #Set of complete observations full.rows=complete.cases(data[,cols]) #Drop missing rows data=data[full.rows,] detach(data) #Detach the full dataset attach(data) #Attach the one without missing values in key variables #1.2 Grab the two key variables to be used, xu and yu xu=eval2(x.f) #xu is the key predictor predicted to be u-shaped yu=eval2(y.f) #yu is the dv #1.3 Replace formula for key predictor so that it accommodates possibly discrete values #1.3.1 Count number of unique x values unique.x=length(unique(xu)) #How many unique values x has #1.3.2 New function segment for x sx.f=paste0("s(",x.f,",bs='cr', k=min(10,",unique.x,"))" ) #2 Run smoother #2.1 Define the formula to be run based on whether there are covariates if (var.count>2) gam.f=paste0(format(nox.f),"+",sx.f) #with covariates if (var.count==2) gam.f=paste0("yu~",sx.f) #without #2.2 Now run it gams=gam(as.formula(gam.f),link=link) #so this is a general additive model with the main specification entered #but we make the first predictor, the one that will be tested for having a u-shaped effect #be estimated with a completely flexible functional form. #(3) Generate yobs (dots) #3.1 If no covariates, yobs is the actually observed data if (var.count==2) yobs=yu #3.2 If covariates present, yobs is the fitted value with u(x) at mean, need new.data() with variables at means if (var.count>2) { #3.3 Put observed data into matrix data.obs=as.data.frame(matrix(nrow=length(xu),ncol=var.count)) #Empty datafile colnames(data.obs)=all.vars(f) #Name variables for (i in 1:(var.count)) data.obs[,i]=eval(as.name(all.vars(f)[i])) #fill in data #3.4 Drop observations with missing values on any of the variables data.obs=na.omit(data.obs) #3.5 Create data where xu is at sample means to get residuals based on rest of models to act as yobs #Recall: columns 1 & 2 have y and u(x) in obs.data data.xufixed =data.obs data.xufixed[,2]=mean(data.obs[,2]) #Note, the 1st predictor, 2nd columns, is always the one hypothesized to be u-shaped #replace it with the mean value of the predictor #3.6 Get yobs with covariates #First the fitted value ##add the modified smoother version of x into the formula yhat.xufixed=predict.gam(gams,newdata = data.xufixed) #get fitted values at means for covariates #3.7 Substract fitted value from observed y yobs = yu-yhat.xufixed #3.8 Create data where u(x) is obs, and all else at sample means data.otherfixed = data.obs #start with original value #3.9 Replace all covariates with their mean for fitting data at sample means for (i in 3:var.count) data.otherfixed[,i]=mean(data.obs[,i]) } #End if covariates are present to compute yobs #4) Get the fitted values at sample means for covariates #4.1) Get predicted values into list if (var.count>2) g.fit=predict.gam(gams,newdata = data.otherfixed,se.fit=TRUE) #predict with covariates at means if (var.count==2) g.fit=predict.gam(gams,se.fit=TRUE) #4.2) Take out the fitted itself y.hat=g.fit$fit #4.3) Now the SE y.se =g.fit$se.fit #5) Most extreme fitted value #5.1) Determine if function is at first decreasing (potential u-shape) vs. increaseing (potentially inverted U) (potential u-shape) orinverted u shaped using quadratic regression #to know if we are looking for max or min xu2=xu^2 #Square x term, xu is the 1st predictor re-cpded if (var.count>2) lmq.f=update(nox.f,~xu+xu2+.) #Add to function with covariates (put first, before covariates) if (var.count==2) lmq.f=yu~xu+xu2 # lmq=lm(as.formula(format(lmq.f))) #Estimate the quadratic regression bqs=lmq$coefficients #Get the point estimates bx1= bqs[2] #point estimate for effect of x bx2=bqs[3] #point estimate for effect of x^2 x0=min(xu) #lowest x-value s0=bx1+2*bx2*x0 #estimated slope at the lowest x-value if (s0>0) shape='inv-ushape' #if the quadratic is increasing at the lowest point, the could be inverted u-shape if (s0<=0) shape='ushape' #if it is decreaseing, then it could be a regular u-shape #5.2 Get the middle 80% of data to avoid an extreme cutoff x10=quantile(xu,.1) x90=quantile(xu,.9) middle=(xu>x10 & xu<x90) #Don't consider extreme values for cutoff x.middle=xu[middle] #5.3 Restrict y.hat to middle y.hat=y.hat[middle] y.se=y.se[middle] #5.4 Find upper and lower band y.ub=y.hat+y.se #+SE is for flat max y.lb=y.hat-y.se #-SE is for flat min #5.5 Find most extreme y-hat if (shape=='inv-ushape') y.most=max(y.hat) #if potentially inverted u-shape, use the highest y-hat as the most extrme if (shape=='ushape') y.most=min(y.hat) #if potential u-shaped, then the lowest instead #5.6 x-value associated with the most extreme value x.most=x.middle[match(y.most, y.hat)] #5.7 Find flat regions if (shape=='inv-ushape') flat=(y.ub>y.most) if (shape=='ushape') flat=(y.lb<y.most) xflat=x.middle[flat] #6 RUN TWO LINE REGRESSIONS #6.1 First an interrupted regression at the midpoint of the flat region rmid=reg2(f,xc=median(xflat),graph=0) #Two line regression at the median point of flat maximum #6.2 Get z1 and z2, statistical strength of both lines at the midpoint z1=abs(rmid$z1) z2=abs(rmid$z2) #6.3 Adjust breakpoint based on z1,z2 xc=quantile(xflat,z2/(z1+z2)) #6.4 Regression split based on adjusted based on z1,z2 #Save to png? (option set at the beggining by giving png a name) if (pngfile!="") png(pngfile, width=2000,height=1500,res=300) #Run the two lines res=reg2(as.formula(format(f)),xc=xc,graph=graph) #Save to png? (close) if (pngfile!="") dev.off() #7 Add other results obtained before to the output (some of these are read by the server and included in the app) res$yobs = yobs res$y.hat = y.hat res$y.ub = y.ub res$y.lb = y.lb res$y.most = y.most res$x.most = x.most res$f = format(f) res$bx1 = bx1 #linear effect in quadratic regression res$bx2 = bx2 #quadratic res$minx = min(xu) #lowest x value res$midflat = median(xflat) res$midz1 = abs(rmid$z1) res$midz2 = abs(rmid$z2) on.exit(detach(data)) res } #End function