##########################################################################################################################################################
########################              Post Processing of SamBada Results: Australia Case Study          ##################################################
##########################################################################################################################################################
### Load the required packages

library(qvalue)
library(raster)
library(data.table) 
library(plotrix)

### Load the required data

# Filtered genotype data (fSNPS) and the PCA object whose axes encode population structure
load('StSn_fSNPS.robj')
load('StSn_PCA.robj')

# Read the environmental shapefile as a data frame and index its rows by sample ID
env <- data.frame(shapefile('StSn_env.shp'))
row.names(env) <- env$ID
# Keep only the samples retained by the genetic filtering, in the row order of fSNPS
env <- env[rownames(fSNPS), ]

# Read the SamBada output with the highest number of variables: the 3-variate models
SAM3 <- fread('mol-data-StSn-Out-3.txt')

# When running multivariate models, SamBada builds all the possible combinations of variables,
# including those made only of environmental descriptors.
# We only want the tri-variate models carrying one environmental variable (Env_1)
# plus the two population structure variables (PS1 and PS2), so we keep just those rows:
SAM3 <- SAM3[with(SAM3, Env_2 == 'PS1' & Env_3 == 'PS2'), ]


############# Calculate Pvalues
### We again use the Gscore, but in this case it compares a null model containing only the population structure variables with an alternative model containing population structure + one environmental variable. 
### When a population shows genetic structure, the rate of false positive associations tends to increase. 
### For this reason, we add a second statistic to filter significant models: the Waldscore.
### The Waldscore test reveals whether the effect of the variable added in the alternative model (i.e. the effect of the environmental variable) is different from 0. 

# Compute P-value for Gscore and Waldscore

# Upper-tail chi-square probabilities with 1 degree of freedom.
# lower.tail = FALSE computes the tail directly; the form 1 - pchisq(...) rounds to
# exactly 0 for large scores (floating-point cancellation), which would distort the
# q-values derived from these p-values.
SAM3$pvalueG <- pchisq(SAM3$GscorePop, df = 1, lower.tail = FALSE)
SAM3$pvalueW <- pchisq(SAM3$WaldScorePop, df = 1, lower.tail = FALSE)

# Note that we are using the columns called GscorePop and WaldScorePop (instead of Gscore and Waldscore). 
# This is because we are running SamBada with the POPULATIONVAR flag on. When you use SamBada with population structure variables, always use these two columns to retrieve the model statistics. 

############# Calculate Qvalues
# Load the qvalues_by_env function
load('qvalues_by_env.Rfun')

# As in the previous example, q-values are computed separately for each environmental variable (Env_1).
SAM3$qvalueG <- qvalues_by_env(pvalues = SAM3$pvalueG, envindex = SAM3$Env_1)
SAM3$qvalueW <- qvalues_by_env(pvalues = SAM3$pvalueW, envindex = SAM3$Env_1)

# We can now check for significant models by thresholding the q-values of both statistics at 0.2.

SAM3[with(SAM3, qvalueG < 0.2 & qvalueW < 0.2), ] # 5 significant models


############### Compress the Results
load('showbest.Rfun')

# Compress the significant models (both q-values below 0.2) so that only the best
# association, ranked by qvalueG, is kept for each gene.
showbest(
  sign = SAM3[with(SAM3, qvalueG < 0.2 & qvalueW < 0.2), ],
  criteria = 'qvalueG'
)
# We have found 3 SNPs with significant association. 

############### Visualize association on a map
load('plot_map_gradient.Rfun')

# We now focus on one particular association, the one between snp009567_0 and Tm006, which appears to be very strong.
# Tm is the average sea surface temperature in the month of June.
# Besides the genotype and Tm006 maps, two further panels show the genetic structure (PC1 and PC2).
par(mfrow = c(2, 2))

# Genotype panel: green = genotype 2, brown = genotype 1, red = genotype 0
plot_map_gradient(
  x = env$coords.x1,
  y = env$coords.x2,
  gradient = fSNPS[, 'snp009567'],
  br = 3,
  main = 'Genotype',
  superposing = 'bysite',
  sites = env$Reefs
)

# Environmental panel
plot_map_gradient(
  x = env$coords.x1,
  y = env$coords.x2,
  gradient = env$Tm006,
  main = 'Tm006'
)

# Population structure panels: PCA scores on the first two axes, ordered like fSNPS
plot_map_gradient(
  x = env$coords.x1,
  y = env$coords.x2,
  gradient = PCA$x[rownames(fSNPS), 1],
  main = 'PC1',
  superposing = 'bysite',
  sites = env$Reefs
)
plot_map_gradient(
  x = env$coords.x1,
  y = env$coords.x2,
  gradient = PCA$x[rownames(fSNPS), 2],
  main = 'PC2',
  superposing = 'bysite',
  sites = env$Reefs
)

# RQ6) Is there a genetic structure variable that might confound with Tm006? Does this association seem plausible to you?

################# Visualize association on a boxplot

# Back to a single-panel layout, then plot Tm006 grouped by the genotype at snp009567
par(mfrow = c(1, 1))
boxplot(
  env$Tm006 ~ fSNPS[, 'snp009567'],
  xlab = 'genotypes',
  ylab = 'Tm006'
)

# RQ7) How do you think selection might act on this SNP? 

# Print the dimensions of fSNPS (rows = samples, columns = SNPs)
dim(fSNPS)
################ Retrieve nucleotide sequence around the SNP
# Stripey Snapper does not have a genome reference, for this reason we don't know how the SNPs are ordered across the genome.
# For studies of this kind, the maximal information that you can get is the nucleotide sequence in which the snp falls. 
# This sequence is usually very short (60-80 bases), but in some cases can be a valuable source of information. 
# The sequence information is provided in the StSn_SNPseq.txt table.

# Read the SNP sequence table; its first column provides the row names.
# header = TRUE is spelled out because T is an ordinary variable that can be reassigned.
seq <- read.table('StSn_SNPseq.txt', row.names = 1, header = TRUE)

# Each SNP has an associated nucleotide sequence, corresponding to the location where the SNP was discovered.
head(seq)

# For instance, we can retrieve the sequence of our SNP of interest with:
seq['snp009567', 'seq']

# Take note of this sequence, then go back to the exercise sheet. 

