############################################################
##########################  Morocco Case Study: preprocessing for SamBada
############################################################
### Load required packages

library(raster)

### Load required inputs

# 'fSNPS.robj' is an R data file; the rest of this script expects loading it
# to restore the genotype matrix `fSNPS` into the workspace.
load("fSNPS.robj")
env <- shapefile("MOOA_ENV_ps.shp")

## Convert the env shapefile object to a data frame and use the sample names
## as row names. Indexing rows by sample name makes it easy to bring the
## genetic and environmental information into the same order later on.

env.df <- as.data.frame(env)
rownames(env.df) <- env$Name




### Create the Genetic Input for SamBada

# Recode the SNP matrix as three binary matrices placed side by side, one per
# possible genotype (0, 1, 2), each flagging presence or absence of that
# genotype. Adding 0 turns the logical TRUE/FALSE values into 1/0.
sGT <- do.call(cbind, lapply(c(0, 1, 2), function(code) fSNPS == code)) + 0

# Give every column a unique header: the SNP name followed by the genotype it
# encodes. Column order is all '_0' columns, then all '_1', then all '_2'.
colnames(sGT) <- paste0(
  rep(colnames(fSNPS), times = 3),
  rep(c("_0", "_1", "_2"), each = ncol(fSNPS))
)

# The new matrix has three times as many columns as the genotype matrix
dim(fSNPS)
dim(sGT)

# Compare the top-left corners of the two matrices:

fSNPS[1:5, 1:5] # 3 genotypes are possible
sGT[1:5, 1:5] # genotype 0 is marked as 1, anything else as 0. Notice the modified headers.

## We can finally write this matrix to a file, adding a Name column containing
## the sample identifiers (taken from the row names of sGT).

# cbind() ignores NULL arguments, so a matrix without row names would silently
# lose its Name column; stop early instead of writing a file without IDs.
stopifnot(!is.null(rownames(sGT)))
sambada_mol <- cbind("Name" = rownames(sGT), sGT)

# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
write.table(
  sambada_mol,
  file = "mol-data-MOOA.txt",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)




### Create the Environmental Input for SamBada

# It is crucial that the genetic and environmental inputs contain the same
# samples in the same order, so env.df is reordered to follow fSNPS.

sample_ids <- rownames(fSNPS)
if (is.null(sample_ids)) {
  stop("fSNPS has no row names to match against env.df", call. = FALSE)
}

# Character row indexing on a data frame partially matches row names and
# returns all-NA rows for names it cannot find. Use exact matching and fail
# loudly if any genotyped sample has no environmental record.
row_idx <- match(sample_ids, rownames(env.df))
if (anyNA(row_idx)) {
  stop(
    "Samples in fSNPS without environmental data: ",
    paste(sample_ids[is.na(row_idx)], collapse = ", "),
    call. = FALSE
  )
}
env.df <- env.df[row_idx, , drop = FALSE] # env.df is now correctly sorted

# Looking at what is included in the env.df data frame shows that there are
# variables that we do not need.
colnames(env.df)

sENV <- env.df[, 5:29] # we exclude ids, coordinates, breed information

# We now want to group environmental variables that are correlated.
# To do so, we use a custom function provided with the exercise data.

load("groupENV.Rfun")
correlation_groups <- groupENV(sENV, 0.7) # 0.7 is the cut-off used to call two variables correlated.

correlation_groups # is a list showing the groups of correlated variables.
# Each group has a key member (indicated after the $ sign) and the correlated
# variables (in quotes).
# If the quotes are empty, the variable does not correlate with any other.

length(correlation_groups) # number of groups left (14 with the exercise data)

# We use the key members to produce the environmental matrix for SamBada.
# drop = FALSE keeps a data frame (with its row names) even if only a single
# group remains; otherwise the result would collapse to a bare vector.

sENV_gr <- sENV[, names(correlation_groups), drop = FALSE]


## Similarly to what we did with the genetic data, we store the environmental
## matrix in a file, with a leading Name column holding the sample identifiers.
sambada_env <- cbind("Name" = rownames(sENV_gr), sENV_gr)

# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
write.table(
  sambada_env,
  file = "env-data-MOOA.txt",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)






### Create the Parameter Files for SamBada

# We can finally create the Parameter file for the SamBada run. 

# Based on the indications on the exercise sheet and on the matrix produced in this R session, try to manually write your parameter file. 

## Once you have written your parameter file, you can load the R function to automatically compute it. 
load("createParam.Rfun")

# Note that this function is mainly conceived for the purposes of this exercise
# and does not account for most of the parameters that can be set in SamBada.

# Create the parameter file by running the function.
# idcol gives the name of the column holding the sample IDs; ps is the number
# of variables describing the population structure (here none).
createParam(
  mol = sambada_mol,
  env = sambada_env,
  idcol = "Name",
  ps = 0,
  filename = "param-MOOA.txt"
)

# The parameter file is now available in the working directory. Check if this matches with what you wrote previously. 


