# REFERENCES
# http://www.stanford.edu/~hastie/Papers/biodiversity/Biodiversity.pdf
# Julia Viladomat, Rahul Mazumder, Alex McInturff, Douglas J. McCauley,
# and Trevor Hastie (2013)
# Assessing the Significance of Global and Local Correlations under
# Spatial Autocorrelation; a Nonparametric Approach.

####### README
# The code for the algorithm proposed in the above paper contains two
# functions, main and matching, with comments along the way to clarify the
# steps. At the end of this file we also include a small running example
# for illustration purposes.
#######

# Input parameters to main:
# data:  data frame containing the sample (X, Y) (that one wishes to study
#        their correlation) and their coordinates for the N data locations.
# Delta: set of values for the proportion of neighbors to consider for the
#        smoothing step. No default. The user may have to experiment with
#        different values to find one suitable for their data.

# matching function.
#
# Smooths a permuted copy of X for every candidate neighborhood proportion
# in Delta, rescales each smooth so that its empirical variogram matches
# target_variog (via a linear regression between the two variograms), and
# returns the candidate whose variogram fits the target best.
#
# Arguments:
#   X.randomized  permuted values of X across locations.
#   long, lat     coordinates of the N data locations.
#   Delta         candidate proportions of neighbors for the local smoother.
#   target_variog binned variogram of the original X (the matching target).
#   prctile       maximum distance used when binning variograms.
#   ids           indices of the (sub)sample used to compute variograms.
#   i             iteration index (kept for interface compatibility).
#
# Returns a list with:
#   residus          residual sum of squares for every value of Delta.
#   delta.star.id    index of the Delta minimizing the residuals.
#   hat.X.delta.star the matched random field for that best Delta.
matching <- function(X.randomized, long, lat, Delta, target_variog,
                     prctile, ids, i) {
  n.delta <- length(Delta)
  variog.X.delta <- vector(mode = "list", length = n.delta)
  linear.fit <- vector(mode = "list", length = n.delta)
  hat.X.delta <- vector(mode = "list", length = n.delta)
  resid.sum.squares <- rep(0, n.delta)

  for (k in seq_len(n.delta)) {
    # smooth X.randomized using locfit:
    fit <- locfit(X.randomized ~ lp(long, lat, nn = Delta[k], deg = 0),
                  kern = "gauss", maxk = 300)
    X.delta <- fitted(fit)
    # variogram of X.delta (computed on the subsample ids only):
    variog.X.delta[[k]] <- variog(data = X.delta[ids],
                                  coords = cbind(long[ids], lat[ids]),
                                  option = "bin", max.dist = prctile,
                                  messages = FALSE)
    # linear regression between the target and X.delta variograms:
    linear.fit[[k]] <- lm(target_variog$v ~ 1 + variog.X.delta[[k]]$v)
    # least square estimates:
    bet.hat <- as.numeric(linear.fit[[k]]$coefficients)
    # transformed X.delta: scale by the slope and add white noise sized by
    # the intercept so the variogram of the result matches the target.
    hat.X.delta[[k]] <- X.delta * sqrt(abs(bet.hat[2])) +
      rnorm(length(X.delta)) * sqrt(abs(bet.hat[1]))
    variog.hat.X.delta <- variog(data = hat.X.delta[[k]][ids],
                                 coords = cbind(long[ids], lat[ids]),
                                 option = "bin", max.dist = prctile,
                                 messages = FALSE)
    # sum of squares of the residuals:
    resid.sum.squares[k] <- sum((variog.hat.X.delta$v - target_variog$v)^2)
  }

  # delta that minimizes the residual sum of squares:
  delta.star.id <- which.min(resid.sum.squares)
  hat.X.delta.star <- hat.X.delta[[delta.star.id]]
  list(residus = resid.sum.squares,
       delta.star.id = delta.star.id,
       hat.X.delta.star = hat.X.delta.star)
}

# main function.
#
# Runs the full algorithm of the paper: builds B random fields with the
# same autocorrelation as X but independent of Y (via matching), then uses
# them as the null distribution to assess the global Pearson correlation
# cor(X, Y) and the local correlations at each of the N locations.
#
# Arguments:
#   data      data frame with columns (X, Y, lat, long), in that order.
#   Delta     candidate neighbor proportions passed to matching().
#   B         number of null realizations (default 1000, as in the paper).
#   n.workers worker budget for doMC; n.workers - 1 cores are registered
#             (default 20, matching the original hard-coded value).
#   varying.bandwidth  if TRUE (default) local correlations use a varying
#             bandwidth (proportion of neighbors); otherwise a fixed one.
#
# Returns a list with:
#   permutations    N x B matrix of matched null fields.
#   p.value.global  p-value for the observed global correlation.
#   p.value.loc.cor vector of N p-values, one per location.
main <- function(data, Delta, B = 1000, n.workers = 20,
                 varying.bandwidth = TRUE) {
  library(geoR)
  library(locfit)
  library(foreach)
  library(doMC)
  registerDoMC(n.workers - 1)

  # load the data X, Y and the coordinates of the N data locations:
  X <- data[, 1]
  Y <- data[, 2]
  lat <- data[, 3]
  long <- data[, 4]
  N <- length(X)

  # If N is too big, we take a subsample of size N_s every time we
  # calculate a variogram:
  N_s <- 1000
  if (N > N_s) {
    ids <- sample(N, N_s)
  } else {
    ids <- seq_len(N)
  }
  X_s <- X[ids]
  long_s <- long[ids]
  lat_s <- lat[ids]

  dists <- dist(cbind(lat_s, long_s))
  # maximum distance for the variogram set at the 25% percentile of the
  # distribution of pairs of distances:
  prctile <- quantile(dists, probs = 0.25)

  # variogram of variable X that will be used as target when doing the
  # matching:
  target_variog <- variog(data = X_s, coords = cbind(long_s, lat_s),
                          max.dist = prctile, option = "bin",
                          messages = FALSE)

  # ALGORITHM:
  # Build B random fields with the same autocorrelation as X but
  # independent of Y, stored column-wise in permutations. These are the
  # B realizations of the null we are interested in.
  permutations <- foreach(i = 1:B, .inorder = TRUE,
                          .combine = cbind) %dopar% {
    print(c('i=', i))
    # random permutation of the values of X across locations:
    X.randomized <- sample(X, size = length(X), replace = FALSE)
    # smoothing and scaling step to match the target variogram:
    output <- matching(X.randomized, long, lat, Delta, target_variog,
                       prctile, ids, i)
    c(output$hat.X.delta.star)
  }

  # ASSESSING THE SINGLE PEARSON'S CORRELATION COEFFICIENT (GLOBAL
  # CORRELATION):
  # observed global correlation:
  cor.global.obs <- as.vector(cor(X, Y))
  # null distribution for the global correlation:
  cor.global <- cor(permutations, Y)
  extreme <- sum(abs(cor.global) > abs(cor.global.obs))
  # p-value:
  p.value.global <- extreme / B

  # ASSESSING THE CORRELATION BETWEEN X AND Y IN A GIVEN NEIGHBORHOOD
  # (LOCAL CORRELATIONS)
  XY <- X * Y
  X2 <- X^2
  Y2 <- Y^2

  if (varying.bandwidth) {
    # delta: proportion of neighbors (varying bandwidth). It is set to 10%
    # of N, but it may be changed by the user to find one more suitable
    # for their data.
    delta <- 0.1
  } else {
    # lambda: non-varying bandwidth. The user may have to experiment with
    # different values to find one suitable for their data.
    lambda <- 5.281
  }

  # Local Gaussian-kernel smooth of a surface z over (long, lat), with
  # either the varying (nn = delta) or fixed (h = lambda) bandwidth.
  smooth.surface <- function(z) {
    if (varying.bandwidth) {
      fitted(locfit(z ~ lp(long, lat, nn = delta, deg = 0),
                    kern = "gauss", maxk = 300))
    } else {
      fitted(locfit(z ~ lp(long, lat, h = lambda, deg = 0),
                    kern = "gauss", maxk = 300))
    }
  }

  # observed local correlations calculated using locfit:
  fitX <- smooth.surface(X)
  fitY <- smooth.surface(Y)
  fitXY <- smooth.surface(XY)
  fitX2 <- smooth.surface(X2)
  fitY2 <- smooth.surface(Y2)
  loc.cor.obs <- (fitXY - fitX * fitY) /
    sqrt((fitX2 - fitX^2) * (fitY2 - fitY^2))

  # loc.cor[i, ] is the null distribution for the observed local
  # correlation loc.cor.obs[i] at location i = 1:N :
  loc.cor <- foreach(k = 1:B, .inorder = TRUE, .combine = cbind) %dopar% {
    Xper <- permutations[, k]
    fitXper <- smooth.surface(Xper)
    fitXYper <- smooth.surface(Xper * Y)
    fitXper2 <- smooth.surface(Xper^2)
    (fitXYper - fitXper * fitY) /
      sqrt((fitXper2 - fitXper^2) * (fitY2 - fitY^2))
  }

  # p-value for each location i = 1:N by counting the observations of the
  # null distribution loc.cor[i, ] more extreme than the observed
  # loc.cor.obs[i]. abs(loc.cor.obs) (length N) recycles down the columns
  # of the N x B matrix, so the comparison is row-wise.
  p.value.loc.cor <- rowSums(abs(loc.cor) > abs(loc.cor.obs)) / B

  list(permutations = permutations,
       p.value.global = p.value.global,
       p.value.loc.cor = p.value.loc.cor)
}

###########################
# The core
# of the main function is the algorithm proposed in the paper.
# It returns B = 1000 random fields with the same autocorrelation as X
# but independent of Y, stored in variable permutations. The algorithm
# calls the matching function, which smoothes the permuted X to match
# the target variogram of X at every B.
# In addition to that, function main also calculates:
# 1. The null distribution for the Pearson's correlation coefficient
#    (global correlation) once the autocorrelation has been accounted
#    for, and the corresponding p-value to assess whether the observed
#    correlation coefficient between X and Y - cor(X, Y) - is significant.
# 2. A map of p-values, one for each location N, (and the nulls used to
#    obtain them), to assess if the observed correlations between X and Y
#    at a given neighbourhood (local correlations) are significant. The
#    local correlations are calculated using the package locfit.
###########################

# RUNNING EXAMPLE -- Simulation of a Gaussian random field with
# Gaussian autocorrelation function
library(RandomFields)

# grid (gridtriple format: origin, end, step)
h <- 1
step <- 0.01  # resolution of the domain
x.grid <- c(0, h, step)
model <- "gauss"
# parameters
variance <- 1
scale <- 0.3  # smoothing parameter
mean <- 0
nugget <- 0

# Two independent realizations of the random field
f <- GaussRF(x = x.grid, y = x.grid, model = model, grid = TRUE,
             gridtriple = TRUE,
             param = c(mean, variance, nugget, scale), n = 2)
n.grid <- seq(0, h, step)
X <- f[, , 1]
Y <- f[, , 2]
# flatten the two grids into N x 1 column vectors:
dim(X) <- c(length(n.grid) * length(n.grid), 1)
dim(Y) <- dim(X)
coords <- cbind(rep(n.grid, times = length(n.grid)),
                rep(n.grid, each = length(n.grid)))

# data to feed the algorithm with
data <- data.frame(X, Y, coords)
N <- 100
ids <- sample(nrow(data), N)
data <- data[ids, ]  # small subsample

# call to main. The example takes ~ 20 secs on OS X 10.8.5 - 2.7 GHz
# Intel Core i7 - 16 GB 1600 MHz DDR3.
# The following value for Delta works for the running example. The user
# may have to experiment with different values to find one suitable for
# their data.
Delta <- seq(0.1, 0.9, 0.1)
results <- main(data = data, Delta = Delta)