## This script is a supplementary file to the following paper:
## Soltysiak A. (2011), Technical Note: An R script for Smith's Mean Measure of Divergence,
## "Bioarchaeology of the Near East" 5:41-44.

## -------------SECTION A: INPUT-----------------------------------------------

file_n = choose.files(default = "", caption = "Select a CSV file with numbers of observations", multi = FALSE)
if (file_n == "") stop("The file with numbers of observation was not loaded!")
M_n = read.csv(file=file_n)
if (file_n == "") file_n = "ERROR"

file_p = choose.files(default = "", caption = "Select a CSV file with trait proportions", multi = FALSE)
if (file_p == "") stop("The file with trait proportions was not loaded!")
M_p = read.csv(file=file_p)
if (file_p == "") file_p = "ERROR"

## -------------SECTION B: MMD FORMULA-----------------------------------------

## Anscombe transformation
theta <- function(n,p) { asin((n/(n+3/4))*(1-2*p)) }
## Freeman & Tukey transformation
#theta <- function(n,p) { 0.5*(asin(1-(2*p*n/(n+1)))+asin(1-(2*((p*n)+1)/(n+1)))) }

## Freeman & Tukey correction
thetadiff <- function(nA,pA,nB,pB) { (theta(nA,pA) - theta(nB,pB))^2 - (1/(nA+0.5) + 1/(nB+0.5)) }
## Grewal correction
#thetadiff <- function(nA,pA,nB,pB) { (theta(nA,pA) - theta(nB,pB))^2 - (1/nA + 1/nB) }
## uncorrected formula
#thetadiff <- function(nA,pA,nB,pB) { (theta(nA,pA) - theta(nB,pB))^2 }

## -------------SECTION C: VARIABLE STATUS-------------------------------------

VarMatrix <- M_n[1:2,2:length(M_n[1,])]
MMDMatrix <- matrix(0,length(M_n[,1]),length(M_n[,1]))

for (a in seq_along(VarMatrix[1,])) {
for (b in seq_along(MMDMatrix[,1])) {
for (c in seq_along(MMDMatrix[1,])) {
## percent frequencies
MMDMatrix[b,c] <- thetadiff(M_n[b,a+1], M_p[b,a+1]/100, M_n[c,a+1], M_p[c,a+1]/100)
## proportions
#MMDMatrix[b,c] <- thetadiff(M_n[b,a+1], M_p[b,a+1], M_n[c,a+1], M_p[c,a+1])
} }

for (b in seq_along(MMDMatrix[,1])) {
for (c in seq_along(MMDMatrix[1,])) {
if (b >= c) { MMDMatrix[b,c] = 0 } } }

VNeg <- 0
VPos <- 0
for (b in seq_along(MMDMatrix[,1])) {
for (c in seq_along(MMDMatrix[1,])) {
if (MMDMatrix[b,c] > 0) { VPos = VPos + 1 }
if (MMDMatrix[b,c] < 0) { VNeg = VNeg + 1 } } }

VarMatrix[1,a] = sum(MMDMatrix)
VarMatrix[2,a] = VPos / (VPos + VNeg) }

VarStatus <- t(VarMatrix)

## -------------SECTION D: MMD MATRIX------------------------------------------

MMDMatrix <- matrix(0,length(M_n[,1]),length(M_n[,1]))
dimnames(MMDMatrix) <- list(M_n[,1], M_n[,1])

for (a in seq_along(MMDMatrix[,1])) {
for (b in seq_along(MMDMatrix[1,])) {
MMDVect <- vector("double",length(M_n[1,])-1)
## percent frequencies
for (i in seq_along(MMDVect)) { MMDVect[i] <- thetadiff(M_n[a,i+1], M_p[a,i+1]/100, M_n[b,i+1], M_p[b,i+1]/100) }
## proportions
#for (i in seq_along(MMDVect)) { MMDVect[i] <- thetadiff(M_n[a,i+1], M_p[a,i+1], M_n[b,i+1], M_p[b,i+1]) }
MMDMatrix[a,b] <- sum(MMDVect) / length(MMDVect) } }

## forced 0 when a sample is compared to itself
#for (a in seq_along(MMDMatrix[,1])) { MMDMatrix[a,a] = 0 }

## -------------SECTION E: SD MATRIX-------------------------------------------

## standard deviation for MMD, Sjovold's formula
SDMatrix <- matrix(0,length(M_n[,1]),length(M_n[,1]))
dimnames(SDMatrix) <- list(M_n[,1], M_n[,1])
SDDiff <- function(nA,nB) { (1/nA + 1/nB)^2 }

for (a in seq_along(MMDMatrix[,1])) {
for (b in seq_along(MMDMatrix[1,])) {
SDVect <- vector("double",length(M_n[1,])-1)
for (i in seq_along(SDVect)) { SDVect[i] <- SDDiff(M_n[a,i+1], M_n[b,i+1]) }
SDMatrix[a,b] <- sqrt(sum(SDVect)*2 / length(SDVect)^2) } }

## -------------SECTION F: SIGNIFICANCE MATRIX---------------------------------

## statistical significance
SigMatrix <- matrix(1,length(M_n[,1]),length(M_n[,1]))
dimnames(SigMatrix) <- list(M_n[,1], M_n[,1])

for (a in seq_along(MMDMatrix[,1])) {
for (b in seq_along(MMDMatrix[1,])) {

dist <- MMDMatrix[a,b] / SDMatrix[a,b]
SigMatrix[a,b] = round((1-pnorm(dist))*2, digits = 8)
if (MMDMatrix[a,b] < 0) SigMatrix[a,b] = 1
} }


## -------------SECTION G: OUTPUT----------------------------------------------

VarStatus
#write.csv(VarStatus, file="output_vars.csv")

MMDMatrix
#write.csv(MMDMatrix, file="output_mmd.csv")

SDMatrix
#write.csv(SDMatrix, file="output_mmd-sd.csv")

SigMatrix
#write.csv(SigMatrix, file="output_mmd-p.csv")

cat("The script was executed for files", file_n, "and", file_p, ".", fill = TRUE)