############################################################################################
##########################  Spatial Genetic Variation
############################################################################################

## Load the fSNPS robject. 

load(file = "fSNPS.robj")  # expected to restore the `fSNPS` genotype table used below

## Environmental data: one record per sample, read from a shapefile

env <- shapefile("MOOA_ENV_ps.shp")

### Inbreeding Calculations

## We will perform the inbreeding calculation on three different datasets: 1) all samples, 2) samples at or above 1000 m and 3) samples below 1000 m.

## Logical masks splitting the samples by altitude (threshold: 1000 m).
## NOTE(review): the masks follow the row order of `env` and are later used to index
## the rows of fSNPS -- this assumes both objects list the samples in the same order; confirm.
a1000 <- env$MAR_alt >= 1000 # TRUE for samples at or above 1000 m of altitude
b1000 <- !a1000              # complement: TRUE for samples below 1000 m of altitude

## Below is a simple function that calculates the inbreeding coefficient for one SNP.

## Inbreeding coefficient (Fs) of a single SNP.
##
## x: vector of genotypes for one SNP across individuals, coded as the number of copies
##    of the counted allele (0, 1 or 2); missing genotypes are NA and are ignored.
##
## Returns Fs = (He - Ho) / He = 1 - Ho / He, where Ho is the observed heterozygosity and
## He the heterozygosity expected under Hardy-Weinberg equilibrium. Returns NA when the
## coefficient is undefined: no genotyped individual, or a monomorphic SNP (He = 0).
inbreeding <- function(x) {
  n_typed <- sum(!is.na(x))          # individuals with a non-missing genotype
  n_het <- sum(x == 1, na.rm = TRUE) # heterozygous individuals
  n_hom <- sum(x == 2, na.rm = TRUE) # individuals carrying two copies of the counted allele

  if (n_typed == 0) {
    return(NA_real_) # nothing genotyped: frequencies cannot be estimated
  }

  obs_he <- n_het / n_typed               # observed heterozygosity
  p <- (2 * n_hom + n_het) / (2 * n_typed) # frequency of the counted allele
  exp_he <- 2 * p * (1 - p)               # expected heterozygosity (Hardy-Weinberg)

  if (exp_he == 0) {
    return(NA_real_) # monomorphic SNP: 0 / 0 is undefined
  }

  (exp_he - obs_he) / exp_he
}

## Apply the inbreeding function to each dataset

## Per-SNP inbreeding coefficient: inbreeding() is applied to every column of each dataset.
## drop = FALSE keeps the row subset two-dimensional even when an altitude class holds a
## single sample; without it the subset collapses to a vector and apply() fails.
Fs_all <- apply(fSNPS, 2, inbreeding)
Fs_a1000 <- apply(fSNPS[a1000, , drop = FALSE], 2, inbreeding)
Fs_b1000 <- apply(fSNPS[b1000, , drop = FALSE], 2, inbreeding)

## Show the three distributions side by side.
boxplot(Fs_all, Fs_a1000, Fs_b1000, names = c('All', 'Above 1000 m', 'Below 1000 m'), ylab = 'Inbreeding')

# A positive Fs indicates inbreeding (fewer heterozygotes than expected); a negative Fs indicates outbreeding (more heterozygotes than expected).
## RQ3) Compare the three groups. What can we say about the effect of altitude on population isolation?


### Observed Heterozygosity 

## We will now calculate the Observed Heterozygosity for each individual (frequency of heterozygous loci) and observe how this metric is distributed across Morocco.

## Individual observed heterozygosity: proportion of heterozygous loci (genotype 1) among
## the loci that were actually genotyped for that individual. Missing genotypes are left
## out of both numerator and denominator (as in inbreeding()), so that individuals with
## more missing data do not appear artificially less heterozygous.
## The result is one value per row of fSNPS.
HE_ind <- rowMeans(fSNPS == 1, na.rm = TRUE)

## To plot the Observed Heterozygosity on a map we employ the custom function provided in the exercise folder.
load(file = "plot_map_gradient.Rfun") # expected to restore plot_map_gradient() into the workspace

# Two arguments are mandatory: the shapefile object holding the coordinates (shp) and the
# vector with the gradient to display (gradient). legendpos places the legend, here in
# the bottom right corner.
plot_map_gradient(
  shp = env,
  gradient = HE_ind,
  legendpos = "bottomright"
)

## RQ4) Can you see a spatial structure? Are there sites where heterozygosity drops?

## We can check whether the heterozygosity index is correlated with any of our environmental variables:

# Correlation matrix of HE_ind together with the environmental columns (column 5 onwards);
# only its first column is kept, i.e. the correlation of every variable with HE_ind.
C <- cor(
  cbind(HE_ind, as.data.frame(env)[, 5:ncol(env)])
)[, 1]

# The first entry is the correlation of HE_ind with itself, so it is left out of the plot.
barplot(C[-1], las = 3)

## RQ5) Is there a variable that associates with heterozygosity?

## Save the Observed heterozygosity index as we will need it later
save(list = "HE_ind", file = "HE_ind.Robj") # writes the HE_ind object to disk under its own name



### Pairwise-Fst

## In a previous example we saw that different breeds tend to occupy distinct zones of the study area. 
## Does this mean that there are breeds more or less isolated from the others? 
## We can check this by calculating the pairwise-Fst. 

## Load the two required packages (if they are not installed, look for them in the Packages tab of RStudio).
## These two packages provide functions for the analysis of population structure.
library("adegenet")
library("hierfstat")

# Load breeds data
# Breed of every sample, labelled with the sample identifier
breeds <- setNames(env$Breed, env$Name)

# Look the breeds up by sample name so that they follow the row order of the fSNPS table
breeds <- breeds[rownames(fSNPS)]

# Create a GenInd object. This is the data format required by the functions of the adegenet and hierfstat packages.
# Since these calculations can be rather slow, we randomly pick 3000 SNPs from the fSNPS matrix to speed them up.

# Random subset of at most 3000 SNPs: the sample size is capped at the number of columns
# so that sample() does not fail when fSNPS holds fewer than 3000 SNPs, and drop = FALSE
# keeps the subset two-dimensional. No seed is set, so the SNPs drawn (and therefore the
# Fst estimates) differ slightly from one run to the next.
GenInd <- df2genind(
  fSNPS[, sample(seq_len(ncol(fSNPS)), min(3000, ncol(fSNPS))), drop = FALSE],
  ploidy = 2, ind.names = rownames(fSNPS), pop = breeds, sep = ""
)

# To calculate the pairwise Fst between breeds, run the following command. This might take a few minutes.
genet.dist(GenInd, method = "WC84")
# WC84 indicates the formula used to calculate Fst, as there are several that have been proposed over the years. This one is from a Weir and Cockerham paper from '84. 

# Fst should range between 0 and 1 (negative values should be interpreted as 0).
# With an Fst close to 0, there is weak or no differentiation between a pair of populations, while an Fst close to 1 indicates very restricted gene flow between the populations.

# RQ6) What can we say about the Moroccan breeds? Are they isolated from each other?


## Save your Rstudio project. 


