##########################################################################################################################################################
########################              Post Processing of SamBada Results: Morocco Case Study          ##################################################
##########################################################################################################################################################
### Load the required packages

# For this exercise we need the qvalue package from Bioconductor.
# This package cannot be found in the Install menu of RStudio.
# To download the qvalue package, run this code:

# Install the Bioconductor package 'qvalue' only when it is missing, so that
# re-running this script does not trigger a new download every time.
# BiocManager is the installer for Bioconductor packages; it is itself
# installed from CRAN if it is not available yet.
if (!requireNamespace("qvalue", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  # No Bioconductor release is pinned here: each release is tied to specific
  # R versions, so a hard-coded `version = "3.8"` fails on other R versions.
  # Without `version`, BiocManager uses the release matching the running R.
  BiocManager::install("qvalue")
}

# If an update is asked, answer "n" (no)

# Load the libraries

library(qvalue)
library(raster)
library(data.table) # install from Rstudio package manager, allows to handle large tables

### Load the required data

# Genotype table: this file is expected to provide the object `fSNPS` used below.
load("fSNPS.robj")

# Environmental data: the shapefile is converted to a data frame to
# facilitate the indexing of rows by sample name.
env <- data.frame(shapefile("MOOA_ENV_ps.shp"))
row.names(env) <- env$Name
# Keep only the samples retained by the genetic filtering, i.e. the rows
# named after the rows of the genotype table.
env <- env[rownames(fSNPS), ]

# SamBada output table; fread() (data.table package) is a function to read
# large tables in R.
SAM1 <- fread("mol-data-MOOA-Out-1.txt")

############# Calculate Pvalues
### In this first exercise we focus on one statistic associated with SamBada models: the G-score.
# G-score is the result of a comparison between two models: the null model (no environmental variables) and the alternative one. If the p-value associated to this statistic is small, we reject the hypothesis that the two models are the same (in other words, the environmental variable "adds" something to the model).

# Compute the p-value for the G-score.
# The G-score is compared against a chi-squared distribution with 1 degree of
# freedom; the p-value is the upper-tail probability of that distribution.
# We ask pchisq() for the upper tail directly (lower.tail = FALSE) instead of
# computing 1 - pchisq(): the subtraction rounds to exactly 0 for large
# G-scores, which would make log-transformed p-values infinite and tie all
# the strongest associations together.

SAM1$pvalueG <- pchisq(SAM1$Gscore, df = 1, lower.tail = FALSE)

############# Calculate Qvalue
# We can now observe how many models resulted in a p-value smaller than 0.01. 
# Number of models with a G-score p-value below 0.01 (missing p-values are not counted).
sum(SAM1$pvalueG < 0.01, na.rm = TRUE)
# A p-value threshold of 0.01 means that when testing 100 false associations we accept that one of them can result as significant by chance.
# The problem is that we tested more than a million models, and therefore we have thousands of significant ones at this threshold.
# We therefore need to correct the p-values and we do so by using the q-value method from Storey. 

# The q-value correction relies on the distribution of the p-values. We can visualize it globally:
hist(SAM1$pvalueG, breaks = 100, xlab = "pvalues")
# We can see that the distribution piles up on the left-hand side (an excess of small p-values), meaning that there are more significant models than expected by chance. If all models were false, we would observe a flat distribution.
# It is important to consider that models associated with different environmental variables do not share the same shape of distribution. You can check this below:
par(mfrow = c(1, 2))
# P-value distribution for Hillshade-associated models
hist(
  SAM1$pvalueG[SAM1$Env_1 == "MAR_hill"],
  breaks = 100,
  xlab = "pvalues",
  main = "Hillshade"
)
# P-value distribution for bioclim7-associated models (temperature annual range)
hist(
  SAM1$pvalueG[SAM1$Env_1 == "bioclim7"],
  breaks = 100,
  xlab = "pvalues",
  main = "bioclim7"
)
# RQ1) For which of the two variables do you expect more true associations? 


# Load the qvalues_by_env function from its saved file.
load("qvalues_by_env.Rfun")

# This function computes the q-values separately for each environmental variable:
# models are split according to the variable they are associated with (Env_1).
SAM1$qvalueG <- qvalues_by_env(envindex = SAM1$Env_1, pvalues = SAM1$pvalueG)

# The models can now be filtered by q-value, for example with a threshold of q = 0.2.

SAM1[qvalueG < 0.2] # 27 significant models

# A q-threshold of 0.2 means that 20% of the models below this cut-off value are false positives. In our case, out of 27 models, ~22 models are expected to be true positives. 
# This cut-off value is quite permissive, but remember that we have reduced the environmental data by excluding correlated variables.
# RQ2) If you think of the way we prepared environmental variables, why do you think it is important to use a permissive q-value threshold at this step?

############### Compress the Results
# One SNP can have three genotypes, and each genotype can be associated with multiple environmental variables. For this reason, the same SNP can have multiple significant models.
# To facilitate the interpretation of the results, it can be useful to keep only the best model for each marker.
# Load the showbest function from its saved file.
load("showbest.Rfun")

# This function processes the table of significant models and returns only the best one for each SNP, based on a statistic of interest (in our case, qvalueG).
showbest(criteria = "qvalueG", sign = SAM1[SAM1$qvalueG < 0.2, ])
# We can see that we reduced the number of models down to 22. 


############### Visualize association on a map
# A good way of checking whether an association is plausible is to observe it on a map. Meaningful associations should display a superposition of the environmental gradient and the frequency of the genotype. 
# We can use again the function to plot gradients. 
load("plot_map_gradient.Rfun")

# We focus on one particular association, for instance the one between genotype 23:44071708_2 and the variable bioclim19 (precipitations in coldest quarter).

# This time, instead of using the shapefile, we specify the x and y coordinates.
par(mfrow = c(1, 2))
# Genotype panel: green = genotype 2, brown = genotype 1, red = genotype 0
plot_map_gradient(
  x = env$Longitude,
  y = env$Latitude,
  gradient = (fSNPS[, "23:44071708"]),
  br = 3,
  main = "Genotype"
)
# Environmental panel: bioclim19 values at the same coordinates
plot_map_gradient(
  x = env$Longitude,
  y = env$Latitude,
  gradient = env$bioclim19,
  main = "Bioclim19"
)

# RQ3) Try to interpret this plot. Does this seem like a plausible association?

################# Visualize association on a boxplot
# Another way of studying an association is to observe it on a boxplot contrasting genotypes and environmental variable. 
par(mfrow = c(1, 1)) # single plotting panel
# bioclim19 values grouped by the genotype at SNP 23:44071708
boxplot(
  env$bioclim19 ~ fSNPS[, "23:44071708"],
  ylab = "bioclim19",
  xlab = "genotypes"
)

# This plot confirms what we saw in the previous one: genotype 2 appears as more frequent under high values of bioclim19 (precipitations in coldest quarter).
# An important piece of information that we can retrieve from this graph is how selection might act.
# RQ4) If you consider alleles (0=aa, 1=aA, 2=AA), how do you think selection might act on this SNP? Does selection target the genotype or the allele?


################ Visualize association on a genetic space
# As you noticed, the id of each marker provides us information about its position on the sheep genome. 
# For instance, the marker we are focusing on 23:44071708 is at position 44071708 on the 23rd chromosome.
# It is important to consider the genetic resolution we are working at, since SNPs sometimes appear significant only because they are close to the real SNPs under selection (a phenomenon known as genetic linkage).
# When possible, it is worth checking how the genetic neighborhood of a SNP associates with a variable of interest.

# To check this, we can use the custom R function get.gen.position.
load("get.gen.position.Rfun")

# We specify that we are interested in chromosome 23 and the variable bioclim19. We also give the position of the SNP and the size of the window we want to visualize around it.
# NOTE(review): `window = 10^7` would be 10 Mb if the window is expressed in base pairs like `position`, whereas this comment used to announce a window of 1 MB - confirm the intended window size.

bioclim19.23 <- get.gen.position(
  fSNPS = fSNPS,
  SAM = SAM1,
  pVAL = "pvalueG",
  envvar = "bioclim19",
  chromosome = 23,
  position = 44071708,
  window = 10^7
)

# The plot resulting from this code shows the log transformed p-values in this genetic neighborhood. 
# RQ5) Where do you think is the mutation linked to bioclim19?

# The bioclim19.23 table contains the position of all the genetic markers of chromosome 23, as well as the p-values of the three genotypes of each marker associated with bioclim19.

# Preview the first six rows of the table.
head(bioclim19.23, n = 6L)

# Note chromosome and position of the genetic marker of interest and go back to the exercise sheet to continue the interpretation of results. 



