#
# Scripts for Unit I: Exploratory Analysis
#
############################################

# Loading Air Quality data
data("airquality")
Oz = airquality$Ozone
ndata = length(Oz)

# True mean and variance
true.mean <- mean(Oz,na.rm=TRUE)
true.var <- var(Oz,na.rm=TRUE)

# Uncertainty on the mean with 100 samples of size 30
all.samp <- replicate(100, sample(Oz, size=30))
xbar <- apply(all.samp, 2, mean, na.rm=TRUE)

# Representation
par(mfrow=c(1,2))
hist(Oz,xlab="Oz (ppm)",main="All samples",
     xlim=c(0,max(Oz,na.rm=TRUE)),probability=FALSE)
hist(xbar,xlab="Oz (ppm)",main="Averages with n=30",
     probability=TRUE)
abline(v=true.mean,col=3,lwd=3)
abline(v=mean(xbar),col=2,lwd=3)
abline(v=true.mean-1.96*sqrt(true.var/30),col=3,lwd=1)
abline(v=true.mean+1.96*sqrt(true.var/30),col=3,lwd=1)
z = seq(min(Oz,na.rm=TRUE),max(Oz,na.rm=TRUE),by=0.1)
f = dnorm(z,mean=true.mean,sd=sqrt(true.var/30))
lines(z,f,type="l",col="blue",lwd=2)

# Confidence intervals with known variance
Lower.Bound = xbar - 1.96*sqrt(true.var/30)
Upper.Bound = xbar + 1.96*sqrt(true.var/30)
Conf.Inter <- cbind(Lower.Bound,Upper.Bound)

# Counting how many times the CI contains the true mean
WITHIN <- (Lower.Bound < true.mean) & (true.mean < Upper.Bound)
sum(WITHIN)
sum(!WITHIN)

# Install R package EnvStats
install.packages("EnvStats")
library(EnvStats)
help(EnvStats)

# Perform a variance test and a t test between two rock types on the complete
# jura data set (jura and Ni are defined in the ANOVA unit below)
var.test(Ni[jura$Rock==1], Ni[jura$Rock==2],
         alternative = "two.sided", conf.level = 0.95)
t.test(Ni[jura$Rock==1], Ni[jura$Rock==2],
       alternative = "two.sided", conf.level = 0.95)

# t test for Ozone data, with different alternative hypotheses
t.test(Oz,alternative = "greater",mu=40)
t.test(Oz,alternative = "two.sided",mu=40)

#
# Scripts for Unit II: Regression
#
############################################

# Loading the data
data("airquality")
airquality

# Linear regression of Ozone wrt Wind
fit1 = lm(Ozone ~ Wind,data=airquality)
class(fit1)     # It belongs to the class "lm" of objects
fit1
# Main results of the fit
summary(fit1)
# str(fit1)     # A detailed look at what's in fit1
plot(fit1)
r.sq1 = summary(fit1)$r.squared

plot(Ozone ~ Wind, data=airquality)
# Looking at the points that cause problems
points(airquality$Wind[117],airquality$Ozone[117],pch=19,col="red")
points(airquality$Wind[62],airquality$Ozone[62],pch=19,col="red")
points(airquality$Wind[101],airquality$Ozone[101],pch=19,col="red")

# What if we use Wind^2 instead? Note the use of the operator I()
help(I)
fit2 = lm(Ozone ~ I(Wind^2),data=airquality)
fit2
r.sq2 = summary(fit2)$r.squared

# What if we do the regression of log(Ozone)?
fit3 = lm(log(Ozone) ~ Wind,data=airquality)
fit3
r.sq3 = summary(fit3)$r.squared    # Not very good ...
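# A quick side-by-side comparison of the three fits (my addition, not in the
# original script). Note that r.sq3 is the R^2 on the log(Ozone) scale, so it
# is not directly comparable with the R^2 of models fitted to Ozone itself.
round(c(Wind = r.sq1, Wind.squared = r.sq2, log.Ozone = r.sq3), digits = 3)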
plot(Ozone ~ Wind, data=airquality)
# Looking at the points that cause problems
points(airquality$Wind[11],airquality$Ozone[11],pch=19,col="red")
points(airquality$Wind[21],airquality$Ozone[21],pch=19,col="red")
points(airquality$Wind[23],airquality$Ozone[23],pch=19,col="red")

# What if we do the regression of sqrt(Ozone)?
fit4 = lm(sqrt(Ozone) ~ Wind,data=airquality)
fit4
r.sq4 = summary(fit4)$r.squared

plot(Ozone ~ Wind, data=airquality)
points(airquality$Wind[11],airquality$Ozone[11],pch=19,col="red")
points(airquality$Wind[21],airquality$Ozone[21],pch=19,col="red")
points(airquality$Wind[23],airquality$Ozone[23],pch=19,col="red")

# What if we do the regression of sqrt(Ozone) using Temp instead of Wind?
fit5 = lm(sqrt(Ozone) ~ Temp,data=airquality)
fit5
plot(fit5)
r.sq5 = summary(fit5)$r.squared
r.sq5     # Much better

# What if we do the regression of sqrt(Ozone) using Solar.R instead of Wind?
fit6 = lm(sqrt(Ozone) ~ Solar.R,data=airquality)
fit6
plot(fit6)
r.sq6 = summary(fit6)$r.squared
r.sq6     # Not better

fit7 = lm(sqrt(Ozone) ~ Temp + Wind,data=airquality)
fit7
plot(fit7)
r.sq7 = summary(fit7)$r.squared
r.sq7     # We increase R^2 as compared to Temp solely

fit8 = lm(sqrt(Ozone) ~ Temp + Wind + Solar.R,data=airquality)
fit8
plot(fit8)
r.sq8 = summary(fit8)$r.squared
r.sq8     # We increase R^2 as compared to Temp + Wind
r.adj8 = summary(fit8)$adj.r.squared
r.adj7 = summary(fit7)$adj.r.squared
r.adj7
r.adj8    # We also increase the adjusted R^2 as compared to Temp + Wind

#
# Scripts for Unit II: ANOVA
#
############################################

# Loading jura dataset
install.packages("gstat")
library(gstat)
data("jura")
jura <- rbind(jura.pred,jura.val)

# ANOVA wrt Rock type
fit = lm(Ni ~ Rock,data=jura)
summary(fit)
aov(Ni ~ Rock, data=jura)
summary(aov(fit))
sum((jura$Ni - mean(jura$Ni))^2)
boxplot(fit$residuals ~ jura$Rock)
abline(h=0,col="blue",lwd=2)

# Checking the residuals for each level
par(mfrow=c(2,2))
for (k in c(1,2,3,5))
  hist(fit$residuals[jura$Rock==levels(jura$Rock)[k]],
       xlab="Residuals",main=levels(jura$Rock)[k])

# Computing the average of Ni in a two-way analysis Rock Type x Land Use
rt = levels(jura$Rock)
lu = levels(jura$Landuse)
Ni = jura$Ni
nrt = length(rt)
nlu = length(lu)
freqJ = matrix(0,nrow=nrt,ncol=nlu) ; average = freqJ
for (i in 1:nrt){
  for (j in 1:nlu){
    freqJ[i,j] = sum( (jura$Rock==rt[i]) & (jura$Landuse==lu[j]) )
    average[i,j] = round( mean(Ni[(jura$Rock==rt[i]) & (jura$Landuse==lu[j])]) ,digits=2)
  }
}
freqJ
average

# Two-way ANOVA
anova(lm(Ni ~ Rock + Landuse + Rock*Landuse,data=jura))

# Same on a random sample of size 100
jura.sel = jura[sample(1:359)[1:100],]
anova(lm(Ni ~ Rock + Landuse + Rock*Landuse,data=jura.sel))

#
# Scripts for Unit III: Time Series
#
############################################

# Loading Nile data
data(Nile)
class(Nile)
plot(Nile,lwd=2)
tt <- as.numeric(time(Nile))
help(poly)
poly(tt,degree=2,raw=TRUE)

# Regression with polynomial of order 2
fit <- lm(Nile ~ poly(tt,degree=2,raw=TRUE))
lines(tt,predict(fit),col='red',lwd=2)

# Regression with polynomial of order 1
fit1 <- lm(Nile ~ tt)
lines(tt,predict(fit1),col='blue',lwd=2)
plot(fit1)
anova(fit1,fit)

# Loading Recife data
recife.dat <- scan("recife.txt")
plot(recife.dat)
recife.ts <- ts(recife.dat, start = c(1953, 1), end = c(1962, 12),frequency = 12)
plot(recife.ts)
monthplot(recife.ts)

# Regression with seasonal component using sine and cosine functions
tt <- 1:length(recife.ts)
s1 <- sin(tt*2*pi/12)
s2 <- cos(tt*2*pi/12)
fit.periodic <- lm(recife.ts ~ s1 + s2)
summary(fit.periodic)
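# A small check of the fitted seasonal component (my addition, not in the
# original script): the sine and cosine coefficients can be combined into a
# single amplitude and phase, since
#   b1*sin(w*t) + b2*cos(w*t) = R*sin(w*t + phi),
# with R = sqrt(b1^2 + b2^2) and phi = atan2(b2, b1).
b <- coef(fit.periodic)[c("s1","s2")]
amplitude <- sqrt(sum(b^2))
phase <- atan2(b["s2"], b["s1"])
amplitude
phase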
pred.periodic <- predict(fit.periodic)
plot(recife.ts,lwd=2)
lines(as.numeric(time(recife.ts)),pred.periodic,col='blue',lwd=2)

# Regression with seasonal + linear trend components
fit.complete <- lm(formula = recife.ts ~ poly(tt, 1, raw = TRUE) + s1 + s2)
summary(fit.complete)
pred.complete <- predict(fit.complete)
plot(recife.ts,lwd=2)
lines(as.numeric(time(recife.ts)),pred.complete,col='red',lwd=2)

# Plots of the residuals
res.complete <- residuals(fit.complete)
plot(res.complete)
res.ts <- ts(res.complete, start = c(1953, 1), end = c(1962, 12),frequency = 12)
plot(res.ts)
acf(res.ts)      # Autocorrelation function of the residuals

# Local non-parametric regression with loess() on Nile data
tt <- as.numeric(time(Nile))
Nile.np <- loess(Nile~tt)
summary(Nile.np)
plot(Nile,xlab="Time",ylab="Nile")
Nile.loess.pred <- predict(Nile.np,data.frame(tt))
lines(tt,Nile.loess.pred,col="blue")
Nile.res <- ts(as.numeric(Nile - Nile.loess.pred),start=start(Nile),end=end(Nile))
plot(Nile.res)
acf(Nile.res)    # Autocorrelation function of the residuals

# ARIMA modeling of Lake Huron data
acf(LakeHuron)   # Autocorrelation function of Lake Huron data
pacf(LakeHuron)  # Partial autocorrelation function of Lake Huron data
fit <- arima(LakeHuron,order=c(1,0,1))   # Fit of an ARIMA(1,0,1) model
LH.pred <- predict(fit,n.ahead=8)
LH.pred

#
# Scripts for Unit IV: Spatial Data
#
############################################

install.packages("sp")
install.packages("geoR")

# Use package sp to handle spatial points
library(sp)
coords <- matrix(c(5, 6, 60, 60), ncol=2)
coords
spDists(coords)                  # Usual (Euclidean) distance
spDists(coords, longlat=TRUE)    # Great-circle distance

library("geoR")

# Elevation data set
data(elevation)
help(elevation)
points(elevation)
help(points.geodata)
points(elevation, cex.min=1, cex.max=4, col="gray")
plot(elevation, lowess=T)
summary(elevation)
names(elevation)

# Reading data into geodata format
jura <- read.geodata("juraset.dat", header = T,data.col=3:11, skip = 22)
names(jura)
jura.csv <- read.csv("juraset.csv", header=TRUE,dec=",")
jura.geo <- as.geodata(jura.csv, data.col=3:11)

# Wolfcamp aquifer data set: computing empirical variograms
data(wolfcamp)
wolf.cloud <- variog(wolfcamp,option="cloud")
plot(wolf.cloud,main='Variogram cloud',pch='.')
wolf.bin <- variog(wolfcamp,option="bin")
plot(wolf.bin)
wolf.bin <- variog(wolfcamp,option="bin",bin.cloud=TRUE)
plot(wolf.bin,bin.cloud=TRUE)
wolf.bin4 <- variog4(wolfcamp)
plot(wolf.bin4)
title(main='Four directions:\n N-S, NE-SW, E-W and SE-NW',cex.main=2)
wolf.bin4 <- variog(wolfcamp,option="bin", direction=pi/4)
wolf.bin2 <- variog(wolfcamp,option="bin", direction=pi/2)
plot(wolf.bin4,type='l')
lines(wolf.bin2$u,wolf.bin2$v,lty=2)

# Swiss rainfall dataset: computing empirical variograms
data(SIC)
points(sic.100, borders = sic.borders, pch=20,cex.max=3)
sic.bin <- variog(sic.100,option="bin",estimator.type="classical",max.dist=220)
sic.bin <- variog(sic.100,max.dist=220,uvec=20)
plot(sic.bin$u,sic.bin$v,pch=20,col=1,ylab="semi-variogram",xlab='h')

# Example of simulations
n <- 100
cov.model <- "exp"
cov.pars <- c(1, .25)
mysim <- grf(n = n,cov.model= cov.model,cov.pars = cov.pars)
points(mysim)
plot(mysim)
abline(v=max(dist(mysim$coords))/2)
n <- 441
mysim2 <- grf(n=n, grid = "reg",cov.pars = cov.pars, cov.model = cov.model)
image(mysim2)
mysim3 <- grf(n=100^2,grid="reg",cov.pars=c(1,.3),cov.model = "exp")

# Swiss rainfall dataset: estimation of a theoretical variogram
data(SIC)
points(sic.100, borders = sic.borders, pch=20,cex.max=3)
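# A minimal sketch (my addition, not part of the course script) of the
# exponential semivariogram model fitted below,
#   gamma(h) = sigma2 * (1 - exp(-h/phi)),
# where sigma2 is the partial sill and phi the range parameter; in variofit()
# and likfit() these correspond to ini = c(sigma2, phi).
exp.semivariogram <- function(h, sigma2, phi) sigma2 * (1 - exp(-h/phi))
# Example: semivariogram value at h = 50 for the initial values used below
exp.semivariogram(h = 50, sigma2 = 15000, phi = 50)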
sic.bin <- variog(sic.100,option="bin",estimator.type="classical",max.dist=220)
plot(sic.bin$u,sic.bin$v,pch=20,col=1,ylab="semi-variogram",xlab='h')
ini <- c(15000,50)
cov.model <- "exp"
wls.fit <- variofit(sic.bin, ini = ini, cov.model=cov.model,weights="cressie",
                    fix.nugget=TRUE)
ml.fit <- likfit(sic.100, ini = ini, cov.model=cov.model,fix.nugget=TRUE)
plot(sic.bin)
lines(wls.fit,lty=1)
lines(ml.fit,lty=2)
legend("bottomright", legend=c("WLS","ML"), lty=c(1,2))

# Kriging
ngridx <- 100
ngridy <- 100
xgrid <- seq(min(sic.borders[,1]),max(sic.borders[,1]),l=ngridx)
ygrid <- seq(min(sic.borders[,2]),max(sic.borders[,2]),l=ngridy)
pred.grid <- expand.grid(xgrid,ygrid)
krige.par <- krige.control(type.krige='ok',cov.pars=wls.fit$cov.pars,
                           cov.model=wls.fit$cov.model)
ksic <- krige.conv(sic.100,locations=pred.grid,krige=krige.par)
image(ksic,main ="Prediction")
contour(ksic,add=TRUE)
points(sic.100,pch=20,add=TRUE,col="green")
se <- sqrt(ksic$krige.var)
image(ksic, val=se, main ="Square root of MSE")
contour(ksic, val=se,add=TRUE)
points(sic.100,pch=20,add=TRUE)

# Cross-validation
xv <- xvalid(sic.100, model = wls.fit)
plot(xv)

# Same with Gaussian model (inadequate here - for illustration purposes only)
cov.model <- "gauss"
ini <- c(15000,30)
wls.gauss <- variofit(sic.bin,ini=ini, cov.model=cov.model,weights="cressie",
                      fix.nugget=TRUE)
xv <- xvalid(sic.100, model = wls.gauss)
cv <- mean(xv$std.error^2)
print(cv)
plot(xv)
plot(sic.bin)
lines(wls.fit,lwd=1.8)
lines(wls.gauss,lty=3,lwd=1.8)
legend("bottomright",legend=c("Exponential","Gaussian"),lty=c(1,3),lwd=1.8)
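# For completeness (my addition, not in the original script): the analogous
# cross-validation score for the exponential model, to compare with the
# Gaussian one above; a mean squared standardized error close to 1 indicates
# well-calibrated kriging standard errors.
xv.exp <- xvalid(sic.100, model = wls.fit)
cv.exp <- mean(xv.exp$std.error^2)
print(cv.exp)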