# SUPPLEMENT A: R CODE FOR RELEVANT METHODS
# Note: A worked-out example is provided at the end of this Supplement.
# ------------------------------------------------------------------------------
calc.g1 <- function(x) {
  # Unadjusted sample skewness (g1): third central moment over the second
  # central moment raised to the 3/2 power.
  # Args: x = vector of raw data (this can be either X or Y)
  # Returns: g1
  n <- length(x)
  dev <- x - mean(x)
  m3 <- sum(dev^3)/n  # third central moment
  m2 <- sum(dev^2)/n  # second central moment (biased variance)
  m3/m2^(3/2)
}
calc.g2 <- function(x) {
  # Unadjusted sample excess kurtosis (g2): fourth central moment over the
  # squared second central moment, minus 3 (so a normal distribution gives 0).
  # Args: x = vector of raw data (this can be either X or Y)
  # Returns: g2
  n <- length(x)
  dev <- x - mean(x)
  m4 <- sum(dev^4)/n  # fourth central moment
  m2 <- sum(dev^2)/n  # second central moment (biased variance)
  m4/m2^2 - 3
}
# The following short functions convert unadjusted estimates to adjusted
# estimates using unbiased cumulants. These equations were verified via Monte
# Carlo estimates of variance that matched those in Joanes & Gill (1998).
# Args: g1 or g2 (unadjusted sample skewness or kurtosis), n=sample size
# Returns: G1 or G2
calc.G1 <- function(g1, n) {
  # Convert unadjusted skewness g1 to the adjusted estimate G1
  # (unbiased-cumulant correction, Joanes & Gill, 1998).
  # Args: g1 = unadjusted sample skewness, n = sample size
  # Returns: G1
  g1 * sqrt(n * (n - 1))/(n - 2)
}
calc.G2 <- function(g2, n) {
  # Convert unadjusted kurtosis g2 to the adjusted estimate G2
  # (unbiased-cumulant correction, Joanes & Gill, 1998).
  # Args: g2 = unadjusted sample kurtosis, n = sample size
  # Returns: G2
  ((n - 1)/((n - 2) * (n - 3))) * ((n + 1) * g2 + 6)
}
calc.z.DAg.skew <- function(x) {
  # z statistic for the D'Agostino, Belanger, & D'Agostino (1990) skewness
  # test. Adapted from 'agostino.test' in the 'moments' package (version .13),
  # with log10 changed to log on two lines so the results match the original
  # D'Agostino equations and the worked example in their article (later
  # 'moments' versions, .14+, contain the same correction).
  # Args: x = vector of raw data (this can be either X or Y)
  # Returns: z value, usually tested two-tailed; e.g., |z| > 1.96 means
  #   skewness differs significantly from 0 at p < .05.
  n <- length(x)
  if (n < 8 || n > 46340) {
    stop("sample size must be between 8 and 46340")
  }
  skew <- (sum((x - mean(x))^3)/n)/(sum((x - mean(x))^2)/n)^(3/2)
  yval <- skew * sqrt((n + 1) * (n + 3)/(6 * (n - 2)))
  beta2 <- 3 * (n * n + 27 * n - 70) * (n + 1) * (n + 3)/
    ((n - 2) * (n + 5) * (n + 7) * (n + 9))
  wval <- sqrt(sqrt(2 * (beta2 - 1)) - 1)
  delta <- 1/sqrt(log(wval))
  alpha <- sqrt(2/(wval * wval - 1))
  # Inverse hyperbolic sine form of the normalizing transformation.
  delta * log(yval/alpha + sqrt((yval/alpha)^2 + 1))
}
calc.z.DAg.kurt <- function(x) {
  # z statistic for the D'Agostino et al. (1990) kurtosis test
  # (Anscombe-Glynn normalizing approximation). Accuracy was confirmed
  # against the worked example in D'Agostino et al. (1990). The sampling
  # distribution of z is usually treated as approximately normal for n >= 20,
  # but the test remained useful down to n = 10 in this study (it still
  # flagged situations where the Fisher Z CI gives poor coverage).
  # Args: x = vector of raw data (this can be either X or Y)
  # Returns: z value, usually tested two-tailed; e.g., |z| > 1.96 means
  #   kurtosis differs significantly from 0 at p < .05.
  n <- length(x)
  b2 <- (sum((x - mean(x))^4)/n)/(sum((x - mean(x))^2)/n)^(2)
  meanb2 <- 3 * (n - 1)/(n + 1)  # E[b2] under normality
  varb2 <- 24 * n * (n - 2) * (n - 3)/((n + 1) * (n + 1) * (n + 3) * (n + 5))
  stdb2 <- (b2 - meanb2)/sqrt(varb2)  # standardized b2
  rootb1 <- 6 * (n^2 - 5 * n + 2) * sqrt(6 * (n + 3) * (n + 5))/
    ((n + 7) * (n + 9) * sqrt(n * (n - 2) * (n - 3)))
  avals <- 6 + (8/rootb1) * (2/rootb1 + sqrt(1 + 4/(rootb1^2)))
  ((1 - 2/(9 * avals)) -
    ((1 - 2/avals)/(1 + stdb2 * sqrt(2/(avals - 4))))^(1/3))/
    sqrt(2/(9 * avals))
}
# Convert r to z'
# Args: r=sample Pearson correlation
# Returns: Fisher's Z (also known as z')
conv.r.to.z <- function(r) {
  # Fisher's r-to-z transformation (z' = atanh(r)).
  # Args: r = sample Pearson correlation
  # Returns: Fisher's Z (also known as z')
  0.5 * log((1 + r)/(1 - r))
}
# Convert z' to r
# Args:z= Fisher's Z (also known as z')
# Returns: sample Pearson correlation
conv.z.to.r <- function(z) {
  # Inverse Fisher transformation (r = tanh(z')).
  # Args: z = Fisher's Z (also known as z')
  # Returns: sample Pearson correlation
  (exp(z) - exp(-z))/(exp(z) + exp(-z))
}
calc.Fisher.z.CI <- function(r, n) {
  # 95% Fisher Z confidence interval for a Pearson correlation: transform r
  # to z', add normal-theory bounds with standard error 1/sqrt(n - 3), then
  # transform back to the r scale.
  # Args: r = sample Pearson correlation, n = sample size
  # Returns: length-2 vector of 95% CI bounds on the r scale
  se <- 1/sqrt(n - 3)
  zcrit <- qnorm(c(0.025, 0.975))
  conv.z.to.r(conv.r.to.z(r) + zcrit * se)
}
calc.Spearman.CIs <- function(rs, n) {
  # 95% confidence intervals for a Spearman correlation, computed on the
  # Fisher z' scale and transformed back to the correlation scale.
  # Args: rs = sample Spearman correlation, n = sample size
  # Returns: list with fiel = Fieller et al. 95% CI (variance 1.06/(n - 3))
  #   and bw = Bonett & Wright 95% CI (variance (1 + rs^2/2)/(n - 3))
  zcrit <- qnorm(c(0.025, 0.975))
  zs <- conv.r.to.z(rs)
  fielbounds <- conv.z.to.r(zs + zcrit * sqrt(1.06/(n - 3)))
  bwbounds <- conv.z.to.r(zs + zcrit * sqrt((1 + (rs^2)/2)/(n - 3)))
  list(fiel = fielbounds, bw = bwbounds)
}
transf.BoxCox <- function(lamb, yorig) {
  # Box-Cox power transformation. The data are first shifted to be strictly
  # positive (minimum becomes 1.00001) since the transform requires y > 0.
  # Args: lamb = lambda parameter, yorig = raw data vector (either X or Y)
  # Returns: Box-Cox transformed data vector
  shifted <- yorig - min(yorig) + 1.00001
  if (lamb == 0) {
    log(shifted)  # limiting case of ((y^lambda) - 1)/lambda as lambda -> 0
  } else {
    ((shifted^lamb) - 1)/lamb
  }
}
transf.RIN <- function(x) {
  # Rank-based inverse normal (RIN) transformation, Rankit variant:
  # map ranks to probabilities (rank - 0.5)/n, then to normal quantiles.
  # Args: x = vector of raw data (this can be either X or Y)
  # Returns: RIN-normalized data vector
  probs <- (rank(x) - 0.5)/length(x)
  qnorm(probs)
}
# The next 3 functions allow for optimal Box-Cox transformation (optimizing the
# lambda parameter for normality). They also wrap up both RIN and Box-Cox
# transformation into a single function that takes an n by 2 matrix of the X and
# Y raw data.
calc.qqcor.1par <- function(lambda, skeweddata, funobj) {
  # Normality criterion for a one-parameter transformation: correlation
  # between the transformed data and their normal QQ-plot theoretical
  # quantiles (higher = closer to normal). Used as the objective in
  # optimize() when tuning the Box-Cox lambda.
  transformed <- funobj(lambda, skeweddata)
  theo <- qqnorm(transformed, plot.it = FALSE)$x
  cor(transformed, theo)
}
calc.qqcor <- function(tdata) {
  # Correlation between data and their normal QQ-plot theoretical quantiles;
  # values near 1 indicate approximate normality.
  theo <- qqnorm(tdata, plot.it = FALSE)$x
  cor(tdata, theo)
}
transf.gen <- function(datmat, funname, parrange = c(-5, 5)) {
  # Transform both columns of datmat with the named method and summarize.
  # For Box-Cox, lambda is optimized per column to maximize the QQ-plot
  # correlation (normality criterion); for RIN no parameter is needed.
  # Args:
  #   datmat = n by 2 matrix of X and Y raw data
  #   funname = "transf.BoxCox" or "transf.RIN"
  #   parrange = search interval for the Box-Cox lambda
  # Returns: c(tr, lambx, lamby, qqcorrx, qqcorry) where tr = correlation of
  #   the transformed columns, lambx/lamby = optimized lambdas (NA for RIN),
  #   qqcorrx/qqcorry = QQ correlations of the transformed columns.
  #   All five elements are NA when funname is unrecognized.
  # Bug fix: the original set an error flag for an unknown funname but still
  # evaluated cor(tx, ty) on never-defined objects (and get(funname) could
  # fail even earlier), so the NA-returning path was unreachable. Unknown
  # names now return NAs immediately.
  if (funname == "transf.BoxCox") {
    funobj <- get(funname)
    modx <- optimize(calc.qqcor.1par, skeweddata = datmat[, 1],
                     funobj = funobj, interval = parrange, maximum = TRUE)
    mody <- optimize(calc.qqcor.1par, skeweddata = datmat[, 2],
                     funobj = funobj, interval = parrange, maximum = TRUE)
    tx <- funobj(modx$maximum, datmat[, 1])
    ty <- funobj(mody$maximum, datmat[, 2])
    lambx <- modx$maximum
    lamby <- mody$maximum
  } else if (funname == "transf.RIN") {
    funobj <- get(funname)
    tx <- funobj(datmat[, 1])
    ty <- funobj(datmat[, 2])
    lambx <- NA
    lamby <- NA
  } else {
    # Unrecognized transformation name: return all-NA result vector.
    return(c(NA, NA, NA, NA, NA))
  }
  c(cor(tx, ty), lambx, lamby, calc.qqcor(tx), calc.qqcor(ty))
}
# The following 5 functions are required for later bootstrapping functions to
# work properly. Some documentation is provided for interested programmers, but
# readers hoping to quickly construct confidence intervals are encouraged to
# skip ahead to the worked out example.
calc.theta.1 <- function(x, sampdat) {
  # Theta function for ordinary nonparametric bootstrapping: Pearson
  # correlation of the rows of sampdat selected by the index vector x.
  cor(sampdat[x, 1], sampdat[x, 2])
}
calc.thetaOI <- function(x, OIBSframe, sampn) {
  # Theta function for the Observed Imposed (OI) bootstrap: draw sampn row
  # indices (with replacement) from x and correlate the corresponding rows
  # of the OI sampling frame.
  drawn <- sample(x, sampn, replace = TRUE)
  cor(OIBSframe[drawn, 1], OIBSframe[drawn, 2])
}
standardize.vec <- function(vecvar) {
  # Standardize a vector to mean 0 and (sample) standard deviation 1.
  (vecvar - mean(vecvar))/sd(vecvar)
}
gen.OI.frame <- function(datmat, n = "") {
  # Generate an Observed Imposed (OI) sampling frame from which to collect
  # bootstrap samples for the OI bootstrap method. All n^2 (x, y) pairings
  # are formed, both margins are standardized, and the observed correlation
  # r is imposed on the second column via y' = r*x + sqrt(1 - r^2)*y.
  # Args: datmat = n by 2 matrix of sample data; n = "" (default) uses the
  #   number of observations in datmat.
  # Returns: samplingframe = n^2 by 2 matrix to sample from for OI bootstrap
  xcol <- datmat[, 1]
  ycol <- datmat[, 2]
  r <- cor(xcol, ycol)
  if (n == "") {
    n <- length(xcol)
  }
  # Rectangular frame: every x repeated n times against the recycled y.
  rectfr <- cbind(rep(xcol, times = 1, each = n), rep(ycol, n))
  stx <- standardize.vec(rectfr[, 1])
  sty <- standardize.vec(rectfr[, 2])
  yprime <- r * stx + sqrt(1 - r^2) * sty
  cbind(stx, yprime)
}
gen.bootstrap <- function(x, nboot, theta, OIBSframe, sampdat, maxjack = 1e+05,
alpha = c(.025, .975)) {
# Generalized bootstrapping function expanded from 'bcanon' function in Efron
# & Tibshirani's (1994) 'bootstrap' package. We used this package because of
# its correspondence to their book (see reference section). However, as the
# package's maintainers note, "This package is primarily provided for projects
# already based on it, and for support of the book. New projects should
# preferentially use the recommended package 'boot'." Most of the changes
# we made to the package were to allow either regular nonparametric
# bootstrapping or Observed Imposed bootstrapping. Some other changes: Any
# bootstrap sample that's all identical is replaced, which occasionally
# happens with small n's, and can be problematic for Monte Carlo studies (see
# Strube, 1988). The originally packaged code had a bug where 'ooo' was
# occasionally rounded to zero, leading to equal low and high bounds for the
# BCa CI. This bug was fixed. The fixed BCa results are labelled 'fixedBCa'
# in the output. This bug also appears to have be fixed in the 2015.2 version
# of the package.
# ---Args:
# x=row indicators; use 1:n ordinarily, and use 1:n^2 for the OI bootstrap
# nboot=number of bootstrap samples (often in the thousands for CIs; 9999
# used in this study)
# theta=function; use calc.theta.1 if regular, calc.thetaOI if OI bootstrap
# OIBSframe=for nonparametric bootstrap use the sample data (n by 2 matrix),
# for OI bootstrap use the OIBSframe generated from gen.OI.frame
# sampdat=sample data (n by 2 matrix)
# maxjack=maximum number of leave-one-out tries for the jackknife estimation
# of the acceralation term in BCa. This parameter will only matter when n
# is large and OI bootstrapping is used. Jackknifing with the OI bootstrap
# can be exceedingly slow because due to a loop across n^2 rather than n
# pairs. Setting maxjack lower than the default speeds up the OI Bootstrap
# by providing an approximate rather than full jackknife.
# Setting maxjack=1000 improved speed for OI with n=160. Time dropped
# from 30 seconds to .7 seconds per simulation. This was necessary in the
# current study because there were 10,000 simulations and 920 scenarios.
# ---Returns:
# thetahat=sample correlation (for debugging purposes only)
# meanthetastar=bootstrap point estimate of the correlation
# confpoints=BCa confidence bounds produced from the original package
# fixedBCa=BCa confidence bounds after fixing the bug that occasionally
# produces inaccurate bounds
# percCI=percentile CI bounds
# AACI=asymptotically adjusted CI bounds
# z0 = bias term from BCa
# acc = acceleration term from BCa
call <- match.call()
thetastar <- rep(NA, nboot)
# If calc.thetaOI then this is the OI bootstrap
if (call[[4]] == "calc.thetaOI") {
n <- round(sqrt(length(x)))
thetahat <- cor(sampdat)[1, 2]
for (curB in 1:nboot) thetastar[curB] <- theta(x, OIBSframe, n)
} else {
n <- length(x)
thetahat <- theta(x, sampdat)
bootsam <- matrix(sample(x, size = n * nboot, replace = TRUE), nrow = nboot)
if (n < 20)
while (sum(apply(bootsam, 1, sd) == 0) > 0) {
rowstoreplace <- which(apply(bootsam, 1, sd) == 0)
for (currow in rowstoreplace) bootsam[currow, ] <- sample(x, size = n,
replace = TRUE)
}
thetastar <- apply(bootsam, 1, theta, sampdat = sampdat)
}
meanthetastar <- mean(thetastar)
# Next eq fits with Hinkley source so as to make results comparable to Beasley
# et al. (2007).
# Efron & Tib. used slightly different eq: