###############################################################################################################
##############################  Multivariate Analysis of Environment
###############################################################################################################

## Load required packages

library(raster)

## Load the environmental data of the samples.
## We work with the values obtained with the Point Sampling method.

MAR_env <- shapefile('./MOOA_ENV_ps.shp')

# List the attribute columns. The first 4 columns (Name, Breed, Longitude and
# Latitude) do not describe environmental variation and are excluded below.
names(MAR_env)

# Build a table holding only the environmental values: convert the attribute
# table to a data frame, then keep column 5 up to the last attribute column.
X <- as.data.frame(MAR_env)
X <- X[, 5:ncol(MAR_env)]

#######################
### Explore the dataset
#######################
## The heatmap() function gives a first glance at the data: it provides an
## overview of how the variables and the samples relate to each other.

heatmap(x = as.matrix(X))

# RQ1) Can you tell which variables are more or less correlated? What is the problem?

# The boxplot shows that the variables are on different scales.
boxplot(X)

## When performing a multivariate analysis, it is essential that all the
## variables we want to compare are on the same scale.

Z <- scale(X, center = TRUE, scale = TRUE)

# Now every variable is centered on 0 and scaled to a standard deviation of 1.

heatmap(x = Z)

## The heatmap now provides a more interesting result.
## Reading the plot by columns shows which environmental variables are most closely related to each other.
## Reading the plot by rows shows the relationships between samples.
## RQ2) Which environmental variables are most strongly correlated? Try to explain why.

################################
### Principal Component Analysis
################################

## The heatmap revealed that several variables are in fact strongly correlated with each other.
## Principal Component Analysis (PCA) allows us to reduce this complexity.

PCA <- prcomp(Z) # performs a PCA on the scaled variables.

# The PCA object has 5 items; we are interested in the following:

PCA$rotation # The rotation matrix. It tells us which environmental variables drive the differentiation between samples.

PCA$x # The PC matrix. It tells us how the samples are related.

PCA$sdev # The standard deviation of each principal component. Its square is the variance carried by that component.

## Below we show how to interpret a PCA on environmental data.

# RQ3) First of all, have a look at a heatmap of the Principal Components matrix. What do you observe? Why?

heatmap(PCA$x)

# Next, have a look at how the first principal component is distributed across the study area.
# For this, we use a custom R function that we load from file.
# NOTE(review): load() expects a file written by save(); presumably
# 'plot_map_gradient.Rfun' is such a file containing plot_map_gradient -- confirm.
load('plot_map_gradient.Rfun')
# This function plots a gradient across a map.
plot_map_gradient(MAR_env, gradient = PCA$x[, 1])

# RQ4) What regions appear as contrasted? (Hint: think of Morocco's topography)
# Next, we look at the amount of variation explained by each principal component.
# The share of variance is computed from the VARIANCES (sdev^2), not from the
# standard deviations: sdev / sum(sdev) is not a proportion of variance.

pct_var <- PCA$sdev^2 / sum(PCA$sdev^2) * 100

pct_var[1] # This is the percentage of variance explained by the first principal component.

# Remember that principal components are ranked by the amount of variance that they explain.
# This means that the first principal component is the one that explains most of the variance.

plot(pct_var, xlab = 'PC#', ylab = '% of variance explained') # You can also observe this on a graph.
# Use cumsum(pct_var) to check how much of the total environmental variance
# the first three principal components explain together.

# Finally, we look at the variables responsible for this characterization.

barplot(PCA$rotation[, 1], las = 3)

# RQ5) What variables are involved? By looking at their geographic distribution, does that make sense?

## Repeat the interpretation for the second and the third principal component.


###########################
### Hierarchical Clustering
###########################

## The heatmap analysis used in the first part of the exercise allowed us to group samples by environmental conditions.
## That approach has a major drawback: it does not account for correlated variables.
## A better approach is to base the analysis on the Principal Components, since they are, by definition, uncorrelated.
## NOTE(review): Euclidean distances computed on ALL principal components are the
## same as those computed on Z (PCA is only a rotation). The benefit described
## above appears only when a subset of components is used, e.g. PCA$x[, 1:3] -- confirm intent.

# To begin, compute the Euclidean distance between every pair of samples from
# their coordinates in the Principal Component matrix.
DM <- dist(PCA$x, method = "euclidean")

# Then build a dendrogram from these distances (complete linkage).
HC <- hclust(DM, method = "complete")

# Visualize it to see how samples group together by environmental similarity.
plot(HC)

# The dendrogram can then be cut into a chosen number of discrete groups.

cl2 <- cutree(HC, k = 2) # 2 clusters
cl3 <- cutree(HC, k = 3) # 3 clusters
cl4 <- cutree(HC, k = 4) # 4 clusters

# We can now export our results to a shapefile and go back to QGIS to interpret them.

# Append the principal component scores and the three cluster assignments
# to the attribute table of the samples.
outputSP <- cbind(MAR_env, PCA$x, cl2, cl3, cl4)
# The cluster assignments are the last three columns; give them explicit names.
names(outputSP)[(ncol(outputSP) - 2):ncol(outputSP)] <- c('cl2', 'cl3', 'cl4')
# Write the result, replacing any existing file. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
shapefile(outputSP, './MOOA_ENV_ps_MV.shp', overwrite = TRUE)

