# # Scripts pour le cours "pratique des statistiques paramétriques" # Ecole Doctorale "Science et Agroscience", UAP # # Author: Denis Allard, 6th of June, 2016 # # To be used and circulated freely ################################################################ ################################################################ # # Day 1: Univariate statistics: inference and testing # ################################################################ # 1. Read the data jura = read.table("http://ciam.inra.fr/biosp/sites/ciam.inra.fr.biosp/files/jura_tout.txt",header=T) Ni = jura$Ni Cu = jura$Cu rt = jura$rt lu = jura$lu # Visualize the data par(mfrow=c(1,2)) plot(jura$x,jura$y,main="Sampling design",xlab="X",ylab="Y",pch=3) plot(jura$x,jura$y,main="Sampling design",xlab="X",ylab="Y", pch=19,cex=Ni/20,col="green") # Compute histogram and boxplots par(mfrow=c(1,1)) hist(Ni) boxplot(Ni ~rt,main="Ni ~ rt") boxplot(Ni ~lu,main="Ni ~ lu") # On Ni, histogram looks almost Gaussian par(mfrow=c(1,2)) hist(Ni) hist(log(Ni)) # On Cu, histogram looks almost Log-Normal, since historgam of log(Ni) is almost Gaussian par(mfrow=c(1,2)) hist(Cu) hist(log(Cu)) # 5. Build point estimate and confidence interval for Ni, on one sample of size n n = 30 x = sample(Ni,size=n) x.bar = mean(x) S2 = var(x) # Here we build a CI with the variance S2 estimated on the small sample => We use Student-t distribution # We use Chi^2 distribution for the variance # We choose a level 0.95 for the interval, but this could bet set to other values alpha = 0.05 x.inf = x.bar - qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1) # qt computes the quantile of a t distrib x.sup = x.bar + qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1) s2.inf = S2*(n-1)/qchisq(1-alpha/2,df=(n-1)) # qchisq computes the quantile of a Chi^2 distrib s2.sup = S2*(n-1)/qchisq(alpha/2,df=(n-1)) # Print the CI and the true value: we can check the true value is in the CI print(c(x.inf,mean(Ni),x.sup)) print(c(s2.inf,var(Ni),s2.sup)) # 7. We re-do this analysis on multiple repetitions nrep = 1000 est.mean = rep(0,nrep) est.var = rep(0,nrep) ok.mean = rep(T,nrep) ok.var = rep(T,nrep) for(i in 1:nrep){ x = sample(Ni,size=n) x.bar = mean(x) S2 = var(x) est.mean[i] = x.bar est.var[i] = S2 x.inf = x.bar - qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1) x.sup = x.bar + qt(1-alpha/2,df = (n-1))*sqrt(S2)/sqrt(n-1) s2.inf = S2*(n-1)/qchisq(1-alpha/2,df=(n-1)) s2.sup = S2*(n-1)/qchisq(alpha/2,df=(n-1)) if ( (mean(Ni) < x.inf) | (mean(Ni) > x.sup) ) ok.mean[i] = FALSE if ( (var(Ni) < s2.inf) | (var(Ni) > s2.sup) ) ok.var[i] = FALSE } print(c("Mean within Confidence Intervals:",sum(ok.mean))) print(c("Variance within Confidence Intervals",sum(ok.var))) # Number of times true value is outside CI is approx 1000*alpha ## Plotting the histogram and the theoretical distribution of the sample mean # and sample variance hist(est.mean,xlab="Ni (mg/kg)",main="Averages with n=30", xlim=c(10,35),ylim=c(0,0.35),probability=TRUE) abline(v=mean(Ni),col=3,lwd=3) abline(v=mean(est.mean),col=2,lwd=3) abline(v=mean(Ni)-qnorm(1-alpha/2)*sqrt(var(Ni)/n),col=3,lwd=1) abline(v=mean(Ni)+qnorm(1-alpha/2)*sqrt(var(Ni)/n),col=3,lwd=1) z = seq(10,30,by=0.1) f = dnorm(z,mean=mean(Ni),sd=sqrt(var(Ni)/n)) lines(z,f,type="l",col="blue",lwd=2) legend(12,0.35,legend = c("pop. mean","aver. of aver."),col=c("green","red"),lwd=2) # hist(est.var,xlab="Ni (mg/kg)",main="Variance with n=30",probability=TRUE,ylim=c(0,0.035)) abline(v=var(Ni),col="green",lwd=3) abline(v=mean(est.var),col="red",lwd=3) abline(v=var(Ni)*qchisq(alpha/2,df=n-1)/(n-1),col=3,lwd=1) abline(v=var(Ni)*qchisq(1-alpha/2,df=n-1)/(n-1),col=3,lwd=1) z = seq(range(est.var)[1],range(est.var)[2],by=0.1) f = dchisq(z*(n-1)/var(Ni),df=(n-1))*(n-1)/var(Ni) lines(z,f,type="l",col="blue",lwd=2) legend(30,0.033,legend = c("pop. mean","aver. of aver."),col=c("green","red"),lwd=2) # 8 -- 10 Testing H0 = "equal variance between groups", vs. H1 = "non equal variances" # If H0 not rejected, we do usual t test for testing equality of means # rock.types = c(1,2,3,5) F.stat = matrix(-1,4,4) # negative values will indicate test not done T.stat = matrix(-1,4,4) p.var = matrix(-1,4,4) # p.mean = matrix(-1,4,4) # for (i in 1:(length(rock.types)-1)){ for (j in (i+1):length(rock.types)){ g1 = rock.types[i] g2 = rock.types[j] n1 = sum(rt == g1) n2 = sum(rt == g2) xbar.1 = mean(Ni[rt==g1]) S2.1 = var(Ni[rt==g1]) xbar.2 = mean(Ni[rt==g2]) S2.2 = var(Ni[rt==g2]) F.stat[i,j] = (n1*S2.1/(n1-1)) / (n2*S2.2/(n2-1)) denom = (n1*S2.1 + n2*S2.2) * (1/n1 + 1/n2) T.stat[i,j] = (xbar.1 - xbar.2)*(n1+n2-2)/ denom if (S2.1 > S2.2){ p.var[i,j] = 1 - pf(F.stat[i,j],df1=(n1-1),df2 = (n2-1)) }else{ p.var[i,j] = pf(F.stat[i,j],df1=(n1-1),df2 = (n2-1)) } p.mean[i,j] = 2*(1 - pt(abs(T.stat[i,j]),df=(n1+n2-2))) } } p.var # (1,3) and (2,4) have equal variance p.mean # (3,4) has pvalue 0.167 for "equal mean", but "equal variance" is rejected => should use alternative t.test(Ni[rt==3],Ni[rt==4],var.equal=FALSE) # the test does not reject equality in means