# ---
# Simuations for the example in section 2 of the paper.
# This code is for the S-Plus programming system.
# Functions are modularized into small tasks to generate the data,
# do the t-test, estimate the parameters, derive the corrected p-values,
# and summarize the set of them.
# The centerpiece is the call corP(genData()) that corrects the p-value
# of a generated data set.

# Note: we set a specific (pseudo) random seed, such that re-running
# the simulation will produce identical results.

# ---
# Generation of simulated data.
# Arguments: threshold, number of multivariate Normal random numbers,
# mean, standard deviation, and correlation.

genData <- 
   function(thr = 0, N = 100, mn = c(0,0), sd = c(1,1), rho = 0.7)
{
  val <- array(rnorm(N * 2), c(2, N))
  covFact <- diag(sd) %*% t(chol(array(c(1, rho, rho, 1), c(2, 2))) )
  val <- covFact %*% val + mn
  val <- t(val)
  ind <- val[,1] > thr
  val <- data.frame(val)
  names(val) <- c("x0", "x1")
  attr(val, "ind") <- ind
  attr(val, "thr") <- thr
  val
}

# ---
# Carrying out a paired t-test for equality of means.
# The data set must be a list with two components, x0 and x1.
# The function returns the p-value of the test.

tTest <- 
   function(data, sub = T)
{
  # paired t-test on full data or subset
  if (sub) data <- data[attr(data, "ind"),]
  t.test(data$x0 - data$x1)$p.value
}

# ---
# Estimation of mean and covariance of a multivariate Normal
# (two-dimensional in our application).
# The input data set is assumed to be a matrix with 2 columns.
# The function returns a list containing mean, std dev, and correlation.

estPars <- 
   function(data)
{
  mn <- apply(data, 2, mean)
  cv <- var(data)
  mn <- rep(mean(mn), 2)
  sd <- rep(mean(diag(cv)), 2)
  rho <- cv[1,2]/sqrt(cv[1,1]*cv[2,2])
  list(mn = mn, sd = sd, rho = rho)
}

# ---
# Correcting the p-value:
# This function conducts nSim simulations to correct an observed p-value.
# The uncorrected p-value is calculated here, and the corrected p-value
# is returned by this function.

corP <- 
   function(data, nSim = 100)
{
  pv0 <- tTest(data)
  est <- estPars(data)
  aux <- double(nSim)
  thr <- attr(data, "thr")
  N <- nrow(data)
  for(i in 1:nSim) {
    aux[i] <- tTest(genData(thr, N, est$mn, est$sd, est$rho))
  }
  val <- mean(aux <= pv0)
  attr(val, "pv0") <- pv0
  val
}

# ---
# This code carries out the simulations (!).
# Note that up to here we have only declarations of functions, 
# but nothing gets executed.
# The following executes the simulations.
# Probably the only thing you want to modify is the value of nsim.
# On a standard PC, 1,000 simulations take a good half hour.

# random number initialization
seed <- c(61, 11, 51, 37, 16,  2, 39, 18, 12, 29, 31,  2)
set.seed(seed)

nsim <- 1000

## Comparing p-values for full data, subset unadjusted and subset adjusted
pVal <- array(0, c(nsim, 3), list(1:nsim, c("Full", "Obs", "Adj")))

date()
cat("Running", nsim, "simulations:\n")

for(i in 1:nsim) {
  if (i %% 50 == 0) cat(i, " ")
  dat <- genData()
  aux <- corP(dat)
  pVal[i, ] <- c(tTest(dat, sub = F), attr(aux, "pv0"), aux)
}

# ---
# the central result: unadjusted and adjusted p-values

pVal <- data.frame(pVal)	

# ---
# Create graphs - need to use the line below in R, but not in S-PLUS
#library(lattice) 
trellis.device(color = F)

# create data sets suitable for grouped histograms
# Figure 1
pdata1 <- data.frame(pval=c(pVal$Full, pVal$Obs),
	               group=factor(rep(c("Full data", "x0 > 0"), each = nrow(pVal)),
                                  levels = c("x0 > 0", "Full data")))
# histogram of p-values
histogram(~pmax(pval, 0.00001)| group, pdata1, xlab = "p-values", # main = "Distribution of p-values",
          breaks=seq(0, 1, by=0.05),
	    layout=c(1, 2),	# 2 rows of one graph each
	    strip=function(...) strip.default(..., style=1),	# take out that bar in the strip
	    par.strip.text=list(cex=1.2))	# larger characters in strips

# Figure 2
pdata2 <- data.frame(pval=c(pVal$Obs, pVal$Adj),
                     group=factor(rep(c("Observed", "Adjusted"), each = nrow(pVal)),
                                  levels = c("Adjusted", "Observed")))

# histogram of p-values
histogram(~pmax(pval, 0.00001)| group, pdata2, xlab = "p-values", 
          breaks=seq(0, 1, by=0.05),
	    layout=c(1, 2),	# 2 rows of one graph each
	    strip=function(...) strip.default(..., style=1),	# take out that bar in the strip
	    par.strip.text=list(cex=1.2))	# larger characters in strips

# Figure 3
xyplot(Adj ~ Obs, pVal, 
	xlab = "Observed p-value", ylab = "Adjusted p-value",
	aspect=1, 
#      scales=list(limits=c(-0.05, 1.05)),  # for R
      scales=list(limits=c(0, 1)),         # for S-PLUS
      panel = function(x, y, ...)
		{
       	panel.xyplot(x, y, ...)
			if(length(x) > 50)
				panel.loess(x, y, span = 0.3, col = 1, lwd = 3)
		}
) 

# ---
# Finally show the central result in numbers:
# Fraction of significant calls before and after adjustment
# (mean of the 0/1 variable whether or not each p-value is below 0.05)

sig.calls <- c(
	"observed"=mean(pVal$Obs <= 0.05),
	"adjusted"=mean(pVal$Adj <= 0.05)
)

cat("Fraction of significant calls before and after correction:\n")
print(sig.calls)

# ---
# The End.
# The full data, observed and adjusted p-values are in pVal, 
# the summary statistics (fraction of significant calls) in sig.calls,
# and the graphs are stored in graphs.