###### Example of Sampling design approaches
##### In these examples, we foresee of collecting 20 samples at 20 distant sites.
### The 20 sites are chosen among the 50km cells of the grids of Kenya (each cell can be a potential sampling site).
library(raster) # load the raster library
shp = shapefile('kenya50kmG.shp') # open the shapefile
env = as.data.frame(shp) # transform to dataframe
### Approach 1) Maximize Geographical distance
D=dist(env[,c('longitude', 'latitude')]) # calculate distance between each site
CL=cutree(hclust(D), k=20) # create 20 spacial clusters based on position
# This map returns 20 cluster of cells, based on geographical distance.
# We could then choose one cell per cluster, and sample there the 20 individuals.
plot(env$longitude, env$latitude, col=CL, pch=CL, main='Geographical Distance Clusters')
shp$GeoCL = CL # add the clustering to the shapefile
### Approach 2) Maximize Environmental distance
PCA=prcomp(env[,-c(1,21,22)]) # compress environmental information via PCA
plot(cumsum(PCA$sdev)/sum(PCA$sdev), ylab='% of explained variance', xlab='PC')
abline(h=0.9) # check how many PCs contain 90% of total variation
D=dist(PCA$x[,1:5]) # calculate environmental distance between each site, using only PCs explaining 90% of variance
CL=cutree(hclust(D), k=20) # create 20 spacial clusters based on environmental value
# This map returns 20 cluster of cells, based on their environmental variability.
# We could then choose one cell per cluster, and sample there the 20 individuals.
plot(env$longitude, env$latitude, col=CL, pch=CL, main='Environmental Distance Clusters')
shp$EnvCL = CL # add the clustering to the shapefile
### Approach 3) Maximize Environmental distance first, then maximize geographical one.
# We start as in the previous case
PCA=prcomp(env[,-c(1,21,22)]) # compress environmental information via PCA
plot(cumsum(PCA$sdev)/sum(PCA$sdev), ylab='% of explained variance', xlab='PC')
abline(h=0.9) # check how many PCs contain 90% of total variation
D=dist(PCA$x[,1:5]) # calculate environmental distance between each site, using only PCs explaining 90% of variance
plot(hclust(D)) # we have a look at how sites cluster on the dendrogram, the we decide a number of environmental clusters.
# We see 4 main branches
CL=cutree(hclust(D), k=4) # create 4 spacial clusters based on environmental value
plot(env$longitude, env$latitude, col=CL, pch=16, main='4 Main Environmental Clusters')
### We run this function to cut each environmental cluster in 5 regions, in order to maximize geographic distance
co=0
nCL=c()
for (i in 1:4) { # for each environmental cluster
sD=dist(env[CL==i,c('longitude', 'latitude')]) # calculate distance based on geographical position
sCL=cutree(hclust(sD), k=5)
nCL[CL==i] = co+sCL
co=co+5
}
# This map returns 4 cluster of cells, based on their environmental variability.
# Furthermore, each environmental cluster is divided in 5 sub-clusters, based on geographical distance.
# We could then choose one cell per subcluster, and sample there the 20 individuals.
plot(env$longitude, env$latitude, col=CL, pch=nCL, main='Environmental Clusters with subdivision by regions')
shp$EnvGeoCL = nCL # add the clustering to the shapefile
shapefile(shp, 'kenya50kmG_CL.shp')