###### Example of Sampling design approaches

##### In these examples, we plan to collect 20 samples at 20 distant sites.
### The 20 sites are chosen among the 50 km cells of a grid covering Kenya (each cell is a potential sampling site).


library(raster) # load the raster library

# Read the 50 km grid; every cell of it is a candidate sampling site.
shp <- shapefile("kenya50kmG.shp")

# Pull the grid's attribute table out as a plain data frame (one row per cell).
# The rest of the script reads the 'longitude' and 'latitude' columns from it.
env <- as.data.frame(shp)

### Approach 1) Maximize Geographical distance 

# Pairwise distance between cells, computed on their longitude/latitude columns.
cell_coords <- env[, c("longitude", "latitude")]
D <- dist(cell_coords)

# Hierarchical clustering on position, cut so that exactly 20 groups remain.
geo_tree <- hclust(D)
CL <- cutree(geo_tree, k = 20)

# Map of the 20 geographic clusters (colour and symbol both encode the cluster).
# One cell can then be picked in each cluster to place the 20 sampling sites.
plot(
  env$longitude, env$latitude,
  col = CL, pch = CL,
  main = "Geographical Distance Clusters"
)

# Store the cluster label of every cell as a new attribute of the shapefile.
shp$GeoCL <- CL
  

### Approach 2) Maximize Environmental distance 

# Compress the environmental information via PCA.
# Columns 1, 21 and 22 are excluded from the PCA -- presumably the cell
# identifier and the longitude/latitude columns; verify against the shapefile.
# NOTE(review): prcomp() centres the variables but does not rescale them by
# default; if the environmental variables are in different units, consider
# scale. = TRUE -- confirm with the data.
PCA <- prcomp(env[, -c(1, 21, 22)])

# Cumulative proportion of variance explained by the leading PCs.
# The variance carried by a PC is the SQUARE of its standard deviation, so the
# ratio must be built from sdev^2 (using sdev itself understates the share of
# the first components).
var_explained <- cumsum(PCA$sdev^2) / sum(PCA$sdev^2)

plot(var_explained, ylab = "% of explained variance", xlab = "PC")
abline(h = 0.9) # check how many PCs contain 90% of total variation

# Smallest number of leading PCs whose cumulative variance reaches 90%,
# derived from the data instead of being read off the plot and hard-coded.
n_pc <- which(var_explained >= 0.9)[1]

# Environmental distance between sites, using only the PCs explaining 90% of
# the variance (drop = FALSE keeps a matrix even when a single PC is retained).
D <- dist(PCA$x[, seq_len(n_pc), drop = FALSE])

CL <- cutree(hclust(D), k = 20) # create 20 clusters based on environmental values


# This map returns 20 clusters of cells, based on their environmental variability.
# We could then choose one cell per cluster, and sample the 20 individuals there.
plot(env$longitude, env$latitude, col = CL, pch = CL, main = "Environmental Distance Clusters")

shp$EnvCL <- CL # add the clustering to the shapefile


### Approach 3) Maximize Environmental distance first, then maximize geographical one.

# We start as in the previous case

n_env <- 4 # number of environmental clusters (main branches seen on the dendrogram)
n_geo <- 5 # number of geographic sub-clusters inside each environmental cluster

# Compress the environmental information via PCA.
# Columns 1, 21 and 22 are excluded from the PCA -- presumably the cell
# identifier and the longitude/latitude columns; verify against the shapefile.
PCA <- prcomp(env[, -c(1, 21, 22)])

# Cumulative proportion of explained variance: a PC's variance is sdev^2, so
# the ratio is built from squared standard deviations, not from sdev itself.
var_explained <- cumsum(PCA$sdev^2) / sum(PCA$sdev^2)
plot(var_explained, ylab = "% of explained variance", xlab = "PC")
abline(h = 0.9) # check how many PCs contain 90% of total variation

# Keep the smallest set of leading PCs that reaches 90% of the variance
# (drop = FALSE keeps a matrix even when a single PC is retained).
n_pc <- which(var_explained >= 0.9)[1]
D <- dist(PCA$x[, seq_len(n_pc), drop = FALSE])

# Build the tree once and reuse it for both the dendrogram and the cut.
env_tree <- hclust(D)
plot(env_tree) # look at how sites cluster on the dendrogram, then decide the number of environmental clusters
# We see 4 main branches
CL <- cutree(env_tree, k = n_env) # create the environmental clusters

plot(env$longitude, env$latitude, col = CL, pch = 16, main = paste(n_env, "Main Environmental Clusters"))

### Split each environmental cluster into geographic regions, in order to
### maximize geographic distance between the sites chosen within it.
co <- 0 # label offset: sub-clusters of environmental cluster i get labels co+1 .. co+n_geo
nCL <- rep(NA_real_, length(CL)) # preallocated: one sub-cluster label per cell
for (i in seq_len(n_env)) { # for each environmental cluster

  in_cl <- CL == i
  n_cells <- sum(in_cl)

  if (n_cells < 2) {
    # hclust() needs at least two observations; a lone cell is its own region.
    sCL <- rep(1, n_cells)
  } else {
    sD <- dist(env[in_cl, c("longitude", "latitude")]) # distance based on geographical position
    # cutree() cannot produce more groups than there are cells, so cap k.
    sCL <- cutree(hclust(sD), k = min(n_geo, n_cells))
  }
  if (n_cells < n_geo) {
    warning("environmental cluster ", i, " has only ", n_cells,
            " cell(s); fewer than ", n_geo, " sub-clusters were created",
            call. = FALSE)
  }

  nCL[in_cl] <- co + sCL
  co <- co + n_geo
}

# This map returns the environmental clusters of cells (colour), based on their
# environmental variability. Each environmental cluster is further divided into
# sub-clusters (symbol), based on geographical distance.
# We could then choose one cell per sub-cluster, and sample the 20 individuals there.
plot(env$longitude, env$latitude, col = CL, pch = nCL, main = "Environmental Clusters with subdivision by regions")

shp$EnvGeoCL <- nCL # add the clustering to the shapefile

# Write the grid, now carrying the cluster attributes added above, to a new
# shapefile. overwrite = TRUE lets the script be re-run: without it the call
# stops with an error as soon as the output file already exists.
shapefile(shp, "kenya50kmG_CL.shp", overwrite = TRUE)