###### Example of sampling design approaches #####
# In these examples we plan to collect 20 samples at 20 distant sites.
### The 20 sites are chosen among the 50 km cells of a grid covering Kenya
### (each cell is a potential sampling site).

library(raster) # load the raster library

shp <- shapefile("kenya50kmG.shp") # open the shapefile
env <- as.data.frame(shp)          # transform to data frame

### Approach 1) Maximize geographical distance ----

# Distance between each pair of sites, computed on the coordinate columns.
D <- dist(env[, c("longitude", "latitude")])
CL <- cutree(hclust(D), k = 20) # create 20 spatial clusters based on position

# This map shows 20 clusters of cells, based on geographical distance.
# We could then choose one cell per cluster, and sample the 20 individuals
# there.
plot(env$longitude, env$latitude, col = CL, pch = CL,
     main = "Geographical Distance Clusters")
shp$GeoCL <- CL # add the clustering to the shapefile

### Approach 2) Maximize environmental distance ----

# Compress the environmental information via PCA.
# NOTE(review): columns 1, 21 and 22 are excluded by position; presumably they
# are the cell ID and the longitude/latitude columns -- confirm against the
# shapefile's attribute table.
# NOTE(review): prcomp() is run without scaling, so variables measured on
# larger numeric scales dominate the components -- confirm this is intended.
PCA <- prcomp(env[, -c(1, 21, 22)])

# Cumulative share of variance explained by the leading PCs. prcomp() returns
# standard deviations in $sdev, so they must be squared to obtain variances.
var_explained <- cumsum(PCA$sdev^2) / sum(PCA$sdev^2)
plot(var_explained, ylab = "% of explained variance", xlab = "PC")
abline(h = 0.9) # check how many PCs contain 90% of total variation

# Smallest number of leading PCs that together explain at least 90% of the
# variance (derived from the data rather than read off the plot by eye).
n_pcs <- which(var_explained >= 0.9)[1]

# Environmental distance between each pair of sites, using only the PCs
# explaining 90% of the variance. drop = FALSE keeps a matrix even when a
# single PC is enough.
D <- dist(PCA$x[, seq_len(n_pcs), drop = FALSE])
env_hc <- hclust(D) # hierarchical clustering on environmental distance
CL <- cutree(env_hc, k = 20) # create 20 clusters based on environmental value

# This map shows 20 clusters of cells, based on their environmental
# variability. We could then choose one cell per cluster, and sample the 20
# individuals there.
plot(env$longitude, env$latitude, col = CL, pch = CL,
     main = "Environmental Distance Clusters")
shp$EnvCL <- CL # add the clustering to the shapefile

### Approach 3) Maximize environmental distance first, then geographical ----

# We start as in the previous case: the PCA, the environmental distance matrix
# D and its hierarchical clustering env_hc computed for Approach 2 are reused
# here instead of being recomputed.

# We have a look at how sites cluster on the dendrogram, then we decide on a
# number of environmental clusters.
plot(env_hc)

# We see 4 main branches.
n_env <- 4 # number of environmental clusters
n_geo <- 5 # number of geographical sub-clusters per environmental cluster
CL <- cutree(env_hc, k = n_env) # create 4 clusters based on environmental value
plot(env$longitude, env$latitude, col = CL, pch = 16,
     main = "4 Main Environmental Clusters")

### We cut each environmental cluster into 5 regions, in order to maximize
### geographic distance.
co <- 0                     # label offset of the current environmental cluster
nCL <- numeric(nrow(env))   # sub-cluster label of every cell (preallocated)
for (i in seq_len(n_env)) { # for each environmental cluster
  in_cluster <- CL == i
  n_cells <- sum(in_cluster)

  # cutree() cannot return more groups than there are cells, and hclust()
  # needs at least two cells, so small clusters get fewer sub-clusters.
  k <- min(n_geo, n_cells)
  if (k < n_geo) {
    warning("Environmental cluster ", i, " has only ", n_cells,
            " cell(s); it is split into ", k, " sub-cluster(s) instead of ",
            n_geo, ".", call. = FALSE)
  }

  if (n_cells < 2) {
    sCL <- rep(1, n_cells)
  } else {
    # distance based on geographical position within this cluster
    sD <- dist(env[in_cluster, c("longitude", "latitude")])
    sCL <- cutree(hclust(sD), k = k)
  }

  nCL[in_cluster] <- co + sCL
  co <- co + n_geo # each environmental cluster owns a block of n_geo labels
}

# This map shows 4 clusters of cells, based on their environmental
# variability. Furthermore, each environmental cluster is divided into 5
# sub-clusters, based on geographical distance. We could then choose one cell
# per sub-cluster, and sample the 20 individuals there.
plot(env$longitude, env$latitude, col = CL, pch = nCL,
     main = "Environmental Clusters with subdivision by regions")
shp$EnvGeoCL <- nCL # add the clustering to the shapefile

# Write the shapefile with the three clustering columns added.
# NOTE(review): raster's shapefile() writer refuses to replace an existing
# file unless overwrite = TRUE is passed, so re-running this script will stop
# here if the output already exists -- confirm whether overwriting is wanted.
shapefile(shp, "kenya50kmG_CL.shp")