######################################
######## The Stripey Snapper Case Study
######################################
### Load the required packages

library(raster)
library(plotrix) # required for the plot_map_gradient function

### Load the Data

load('StSn_SNPS.robj') # SNPS is an Robj
env=shapefile('StSn_env.shp')   # env is a shapefile


### Filter of Genotype Matrix

### Missingness by SNP
MNsnp <- apply(SNPS, 2, function(x) { sum(is.na(x))/length(x) })
hist(MNsnp, breaks=10) # Missingness is higher than in the Moroccan example. 
SNPS <- SNPS[,MNsnp<0.05] # We apply a 5% threshold

### Missingness by Individual
MNind <- apply(SNPS, 1, function(x) { sum(is.na(x))/length(x) })
hist(MNind, breaks=100) # Only one individual has very high missingness. 
SNPS <- SNPS[MNind<0.05,] # We apply a 5% threshold

### Minor Allele Frequency
MAF <- apply(SNPS, 2, function(x) {
  a=(sum(x=='1', na.rm=T)+sum(x=='0', na.rm=T)*2)/(sum(is.na(x)==F)*2)
  A=(sum(x=='1', na.rm=T)+sum(x=='2', na.rm=T)*2)/(sum(is.na(x)==F)*2)
  return(min(a,A))
})
hist(MAF, breaks=100) # The distribution has a left skew.
SNPS <- SNPS[,MAF>0.05] # We apply a 5% threshold. 

### Major Genotype Frequency
MGF <- apply(SNPS, 2, function(x) {
  aa=(sum(x=='0', na.rm=T))/(sum(is.na(x)==F))
  aA=(sum(x=='1', na.rm=T))/(sum(is.na(x)==F))
  AA=(sum(x=='2', na.rm=T))/(sum(is.na(x)==F))
  return(max(aa,aA,AA))
})
hist(MGF, breaks=100) 
SNPS <- SNPS[,MGF<0.95] # We apply a 95% threshold 


dim(SNPS) # The filtering reduced the number of SNPS down to ~5000! Only one individuals was removed.
# We lost more than half of the SNPS, which is not surprising with dataset using genome simiplicatin techniques. 

# We rename the filtered SNP table and we save it as an robject
fSNPS=SNPS
save(fSNPS, file='StSn_fSNPS.robj')


# Update the environmental data removing individuals filtered out.

env= env[env$ID%in%rownames(fSNPS),] # keep only rows that are found in fSNP table

# Make sure that env and fSNPS are in the same order

fSNPS=fSNPS[env$ID,]

### Observed Heterozygosity 

# We calculate the observed heterozygosity as seen previously
HE_ind <- apply(fSNPS,1, function(x){return(sum(x=='1', na.rm=T)/length(x))}) 

# We can visualize heterozygosity on a map using the same function as we did for the Moroccan sheep. 
load('plot_map_gradient.Rfun')

plot_map_gradient(shp=env, gradient=HE_ind, legendpos='bottomright')

# Many point are superposing, to better visualize superposed points we use an additional option of the function that requires the plotrix library (loaded at the begin of the exercise).

plot_map_gradient(shp=env, gradient=HE_ind, legendpos='bottomright', superposing = 'expand') # the superposing='expand' flag allows to visualize points falling under the same coordinate. 

# Another option that allows the visualization of superposed points is the superposing='bysite' flag. 

plot_map_gradient(shp=env, gradient=HE_ind, legendpos='bottomright', superposing = 'bysite', sites = env$Reefs, main='Mean Obs Heterozygosity per Site') 
# the superposing='expand' flag allows to visualize points falling under the same coordinate. 
# This option requires an additional input describing to which site do samples belong to and subsequently calculates the mean value for each of location. 
# We can observe that there is no particular patterns of heterozygosity across this seascape. 


### Population Structure Analysis via PCA

X <- fSNPS[,apply(fSNPS,2,function(x) {return(is.na(sum(x))==F)})] # filter out SNPs with missing values

dim(X) # The population structure study will involve 2285 loci.

PCA <- prcomp(X, scale=T) # launch the PCA

## We can have a look at how the first principal components describe our samples.

par(mfrow=c(2,2))
plot(PCA$x[,1:2], xlab='PC1', ylab='PC2')
plot(PCA$x[,3:4], xlab='PC3', ylab='PC4') 
plot(PCA$x[,5:6], xlab='PC5', ylab='PC6') 
plot(PCA$x[,7:8], xlab='PC7', ylab='PC8')

# Hard to tell if there is a population structure, it is worth to check the plot of the & of variance explained by each principal component. 
par(mfrow=c(1,1))
plot(PCA$sdev/sum(PCA$sdev), ylab='% of Variance Explained', xlab='PCs') 

# The first two PCs clearly stand out from the global trend of the % of explained variance.
# It seems as is there is a substantial population structure in this dataset.
# To verify this, we look at the distribution of the first two PCs on the map. 

par(mfrow=c(1,1))
plot_map_gradient(shp=env, gradient=PCA$x[,1], legendpos='bottomright', superposing = 'expand') 
plot_map_gradient(main='PC1', shp=env, gradient=PCA$x[,1], legendpos='bottomright', superposing = 'bysite', sites = env$Reefs) 

plot_map_gradient(shp=env, gradient=PCA$x[,2], legendpos='bottomright', superposing = 'expand') 
plot_map_gradient(main='PC2', shp=env, gradient=PCA$x[,2], legendpos='bottomright', superposing = 'bysite', sites = env$Reefs) 


# We can see  clerly the presence of a spatial pattern of the first two PCs.
# PC1 stresses the differences between reefs at the very south of the coast an the rest of the reefs. 
# PC2 shows the contrast between north and south. 

# We store the PCA of the genotype matrix as an .robject
save(PCA, file='StSn_PCA.robj')