### split moving window (SMW) boundary-finding
## refs. Webster: J. Soil Sci. 29:388; Comp. GeoSci. 6:61
## Programmer: D G Rossiter, B Natural Consulting, Ithaca NY USA
## last modified:2013-10-16

### functions:
##       smw.pc    : extract PCs
##       smw.dw    : moving window analysis
##       smw.graph : visualise results of smw.dw()

### usage
## 1. set up a transect to be analyzed as a data frame
## 2. load this file into the R workspace
## 3. smw.pc() to extract the PC's; this returns the selected PC's
##    Note: you can skip this step if you want to work on the original
##      variables; in that case you may want to at least scale() them,
##      otherwise the largest numerically will dominate
##      but if they all represent the same kind of data maybe unscaled
##      is also interesting
## 4. Examine PC's to see what they represent. The following steps
##    are finding boundaries based on these.
## 5. acf() on the first PC to get some idea of the boundary spacing
##    Webster recommends a window of 2/3 of this expected spacing
##    (where the autocorrelation function approaches zero)
## 6. acf() on higher PCs which explain some variation; only use those
##     with some auto-correlation to determine the boundaries
## 7. smw.dw() to analyze the windows; this returns the analyzed transect
##    This gives you a sorted list of the candidate boundaries
## 8. open a graphics window, e.g. window(), or to a file e.g. pdf()
## 9. smw.graph() to visualise

### method smw.pc()
##    extracts standardized PC's from a multivariate transect
## arguments
##      transect: data frame; rows are equally-spaced,
##                columns are variables
##      n.pc    : number of PCs to extract, default 2
## returns
##      data frame with columns as PC scores named "PC.1" etc.
smw.pc <- function(transect, n.pc=2)
  {
                # can't ask for more PCs than variables
    n.pc <- min(n.pc, length(transect[1,]))
                # standardized PCs with scores
    pc <- prcomp(transect, center=T, scale=T, retx=T)
                # frame to return, with named synthetic variables
    pcs <- as.data.frame(pc$x[,1:n.pc])
                # print some diagnostics
    print(paste("PCA: selected component",
                ifelse(n.pc==1,"","s"),
                " explain",
                ifelse(n.pc==1,"s "," "),
                round(sum((pc$sdev^2/(sum(pc$sdev^2)))[1:n.pc]),3),
                " of the variance", sep=""))
    print(paste("Loadings for the first",
                ifelse(n.pc==1,"",paste("",n.pc))," component",
                ifelse(n.pc==1,"","s"),":", sep=""))
    print(pc$rotation[,1:n.pc])
                # return frame of synthetic variables (PC scores)
    return(pcs)
  }

### method smw.dw()
##     find Mahalanobis distances for one transect and window size
## arguments
##      tsect: data frame; rows are equally-spaced observations,
##                columns are variables
##        wid: points in moving window
##       mull: points to omit at boundary from each half-window ("mullion")
##      p.mmd: proportion of maximum Mahalanobis distance for a
##             boundary to show in the summary table
##      ident: calculate Euclidean (T) or Mahalanobis (F, default) distances?
## returns
##      data frame with station and D2 (distance) in station order;
##         note l. and r. ends will be 0 (not in any window)
## errors
##      traps singular pooled covariance matrix and reports position
smw.dw <- function(tsect, wid=floor(length(tsect[,1])/4), mull=0, p.mmd=.25,
                   ident=FALSE)
{
                # silently correct unreasonable arguments
  mull=floor(mull); wid=floor(wid)
  p.mmd <- min(1, p.mmd); p.mmd <- max(0.05, p.mmd)
                # check if arguments make sense
  if (mull < 0) {
    print("Error: mullion must be positive");
    return(NULL) }
  if (mull > (wid/8)) {
    print("Error: mullion must be less than 1/4 of the half-window width");
    return(NULL) }
                # length of sequence
  len <- dim(tsect)[1]
  if ((wid < 0) || (wid > floor(len/2))) {
    print(paste(
       "Error: width must be positive and less than 1/2 of transect length:",
        len));
    return(NULL) }
                # number of variables
  nvar <- dim(tsect)[2]
                # offset from left to possible boundary;
                # for odd widths, this is the station to its left
  b.offset <- floor(wid/2)
                # half width of interval, maybe mullioned
                # remove one (common) point for even widths
  hwid <-  b.offset  - (!(wid%%2)) - mull
                # collect d's with their centre
                # both ends will be 0 (not wide enough for a window)
  d <- vector(mode="numeric", length=len)
                # move left boundary of window along transect
  for (l in 1:(len-wid)) {
                # right boundary of window
    r <- l + wid
                # option: Mahalanobis distance
                # extract halves
    win.l <- tsect[l:(l+hwid),]; win.r <- tsect[(r-hwid):r,]
                # compute column-wise means, or one mean for a vector
    if (is.null(dim(win.l))) {
        win.l.mean <- mean(win.l); win.r.mean <- mean(win.r)
    }
    else {
        win.l.mean <- apply(win.l, 2, mean)
        win.r.mean <- apply(win.r, 2, mean)
    }
    if (ident==F) {
                                        # pool halves after subtracting means
        pool <- (rbind(t(t(win.l) - win.l.mean), t(t(win.r) - win.r.mean)))
                                        #  store D^2 by boundary location
        tmp <- try(d[l + b.offset] <-
                   mahalanobis(win.l.mean, win.r.mean, cov(pool)),
                   silent=TRUE)
        if (class(tmp) == "try-error") {
            d[l + b.offset] <- 0
            print(paste("Position ", l+hwid,
                        ": Singular pooled covariance matrix; not a boundary",
                        sep=""))
        }
    }
                                        # option: Euclidean distance
    else
        d[l + b.offset] <- mahalanobis(win.l.mean, win.r.mean, diag(nvar))
}
                # sort in decreasing order of probable boundary
  ds <- sort(d, index=T, decreasing=T)
  ds <- data.frame(station=ds$ix[ds$x>0], D2=ds$x[ds$x>0])
  print(paste("Window:",wid,"stations; mullion: ",mull))
  print(paste("Distances up to", p.mmd, "of the maximum"))
  print(paste("Boundary",ifelse(!(wid%%2),"at","to the right of"),"station")) 
  print(ds[ds$D2 >= (ds$D2[1]*p.mmd),])
  return(data.frame(station=seq(1:length(d)), D2=d))
}

### method smw.graph()
##     visualise possible boundaries
## graphical output:
##      transect by station, with distances
##        1 - connected by lines
##        2 - as vertical bars for most significant (highlight)
## required argument:
##      ds : data frame returned by smw.dw; this is sorted by station
## optional arguments:
##      p.mmd: proportion of maximum  distance for a
##             boundary to be highlighted on the graph
##      text : extra info. to printed as graph subtitle
##      vcol : colour of vertical bars and their text (possible boundary)
##      ymax : maximum squared distance (y-axis); allows comparaison
##                of several graphs on the same scale
## returns: nothing
smw.graph <- function (ds, p.mmd=.25, text="", vcol="blue", ymax=NULL) {
                # sanity check on proportion of maximum to call a boundary
  p.mmd <- min(1, p.mmd); p.mmd <- max(0.05, p.mmd)
  if (dev.cur() >1) {
                # the transect
    ylim <- (if (is.null(ymax)) NULL  else c(0, ymax))
    plot(ds, xlab="station", ylab="squared distance between halves",
         type="l", main="Boundaries?", sub=text, ylim = ylim)
    abline(h=0, lty=2)
                # main boundaries -- compare to imposed maximum if any
                #  otherwise within-transect maximum
    d.sig <- ds[ds$D2 > ifelse(is.null(ymax), max(ds$D2), ymax)*p.mmd,]
    for (i in 1:length(d.sig$D2)) {
      s <- d.sig$station[i]; d2 <- d.sig$D2[i]
      lines(c(s, s), c(0, d2), col=vcol)
      text(s,  d2, s, pos=4, col=vcol)
    }
  }
}