#
# Scripts pour le cours "pratique des statistiques paramétriques"
# Ecole Doctorale "Science et Agroscience", UAP
#
# Author: Denis Allard, 6th of June, 2016
#
# To be used and circulated freely
################################################################




################################################################
#
#  Day 1: Univariate statistics: inference and testing
#
################################################################


# 1. Read the data
jura = read.table("http://ciam.inra.fr/biosp/sites/ciam.inra.fr.biosp/files/jura_tout.txt",header=T)
Ni   = jura$Ni
Cu   = jura$Cu
rt = jura$rt
lu = jura$lu

# Visualize the data
par(mfrow=c(1,2))
plot(jura$x,jura$y,main="Sampling design",xlab="X",ylab="Y",pch=3)
plot(jura$x,jura$y,main="Sampling design",xlab="X",ylab="Y",
     pch=19,cex=Ni/20,col="green")

# Compute histogram and boxplots
par(mfrow=c(1,1))
hist(Ni)
boxplot(Ni ~rt,main="Ni ~ rt")
boxplot(Ni ~lu,main="Ni ~ lu")

# On Ni, histogram looks almost Gaussian
par(mfrow=c(1,2))
hist(Ni)    
hist(log(Ni))   


# On Cu, histogram looks almost Log-Normal, since historgam of log(Ni) is almost Gaussian
par(mfrow=c(1,2))
hist(Cu)    
hist(log(Cu))   


# 5. Build point estimate and confidence interval for Ni, on one sample of size n
n = 30
x = sample(Ni,size=n)
x.bar = mean(x)
S2   = var(x)

# Here we build a CI with the variance S2 estimated on the small sample => We use Student-t distribution
# We use Chi^2 distribution for the variance
# We choose a level 0.95 for the interval, but this could bet set to other values

alpha = 0.05
x.inf = x.bar - qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1) # qt computes the quantile of a t distrib
x.sup = x.bar + qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1)
s2.inf = S2*(n-1)/qchisq(1-alpha/2,df=(n-1))   # qchisq computes the quantile of a Chi^2 distrib
s2.sup = S2*(n-1)/qchisq(alpha/2,df=(n-1))


# Print the CI and the true value: we can check the true value is in the CI
print(c(x.inf,mean(Ni),x.sup))
print(c(s2.inf,var(Ni),s2.sup))

# 7. We re-do this analysis on multiple repetitions
nrep     = 1000
est.mean = rep(0,nrep)
est.var  = rep(0,nrep)
ok.mean  = rep(T,nrep)
ok.var   = rep(T,nrep)
for(i in 1:nrep){
  x     = sample(Ni,size=n)
  x.bar = mean(x)
  S2    = var(x)
  est.mean[i] = x.bar
  est.var[i]  = S2 
  x.inf = x.bar - qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1)
  x.sup = x.bar + qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1)
  s2.inf = S2*(n-1)/qchisq(1-alpha/2,df=(n-1))
  s2.sup = S2*(n-1)/qchisq(alpha/2,df=(n-1))
  
  if ( (mean(Ni) < x.inf) | (mean(Ni) > x.sup) ) ok.mean[i] = FALSE
  if ( (var(Ni)  < s2.inf) | (var(Ni) > s2.sup) ) ok.var[i] = FALSE
}
print(c("Mean within Confidence Intervals:",sum(ok.mean)))    
print(c("Variance within Confidence Intervals",sum(ok.var)))
# Number of times true value is outside CI is approx 1000*alpha 


## Plotting the histogram and the theoretical distribution of the sample mean
#   and sample variance

hist(est.mean,xlab="Ni (mg/kg)",main="Averages with n=30",
     xlim=c(10,35),ylim=c(0,0.35),probability=TRUE)
abline(v=mean(Ni),col=3,lwd=3)
abline(v=mean(est.mean),col=2,lwd=3)
abline(v=mean(Ni)-qnorm(1-alpha/2)*sqrt(var(Ni)/n),col=3,lwd=1)
abline(v=mean(Ni)+qnorm(1-alpha/2)*sqrt(var(Ni)/n),col=3,lwd=1)
z = seq(10,30,by=0.1)
f = dnorm(z,mean=mean(Ni),sd=sqrt(var(Ni)/n))
lines(z,f,type="l",col="blue",lwd=2)
legend(12,0.35,legend = c("pop. mean","aver. of aver."),col=c("green","red"),lwd=2)


#
hist(est.var,xlab="Ni (mg/kg)",main="Variance with n=30",probability=TRUE,ylim=c(0,0.035))
abline(v=var(Ni),col="green",lwd=3)
abline(v=mean(est.var),col="red",lwd=3)
abline(v=var(Ni)*qchisq(alpha/2,df=n-1)/(n-1),col=3,lwd=1)
abline(v=var(Ni)*qchisq(1-alpha/2,df=n-1)/(n-1),col=3,lwd=1)

z = seq(range(est.var)[1],range(est.var)[2],by=0.1)
f = dchisq(z*(n-1)/var(Ni),df=(n-1))*(n-1)/var(Ni)
lines(z,f,type="l",col="blue",lwd=2)
legend(30,0.033,legend = c("pop. mean","aver. of aver."),col=c("green","red"),lwd=2)


# 8 -- 10 Testing H0 = "equal variance between groups", vs. H1 = "non equal variances"
#         If H0 not rejected, we do usual t test for testing equality of means
#
rock.types = c(1,2,3,5)
F.stat = matrix(-1,4,4) # negative values will indicate test not done
T.stat = matrix(-1,4,4)
p.var  = matrix(-1,4,4) # 
p.mean = matrix(-1,4,4) #

for (i in 1:(length(rock.types)-1)){
  for (j  in (i+1):length(rock.types)){
    g1 = rock.types[i]
    g2 = rock.types[j]
    n1 = sum(rt == g1)
    n2 = sum(rt == g2)
    xbar.1 = mean(Ni[rt==g1])
    S2.1   = var(Ni[rt==g1])
    xbar.2 = mean(Ni[rt==g2])
    S2.2   = var(Ni[rt==g2])
    F.stat[i,j]  = (n1*S2.1/(n1-1)) / (n2*S2.2/(n2-1))  
    denom  = (n1*S2.1 + n2*S2.2) * (1/n1 + 1/n2)
    T.stat[i,j]  = (xbar.1 - xbar.2)*(n1+n2-2)/ denom
    if (S2.1 > S2.2){
      p.var[i,j]  = 1 - pf(F.stat[i,j],df1=(n1-1),df2 = (n2-1))
    }else{
      p.var[i,j] = pf(F.stat[i,j],df1=(n1-1),df2 = (n2-1))
    }
    p.mean[i,j] = 2*(1 - pt(abs(T.stat[i,j]),df=(n1+n2-2)))
  }
}
p.var  # (1,3) and (2,4) have equal variance
p.mean # (3,4) has pvalue 0.167 for "equal mean", but "equal variance" is rejected => should use alternative
t.test(Ni[rt==3],Ni[rt==4],var.equal=FALSE) # the test does not reject equality in means