###################################### ######## The Stripey Snapper Case Study ###################################### ### Load the required packages library(raster) library(plotrix) # required for the plot_map_gradient function ### Load the Data load('StSn_SNPS.robj') # SNPS is an Robj env=shapefile('StSn_env.shp') # env is a shapefile ### Filter of Genotype Matrix ### Missingness by SNP MNsnp <- apply(SNPS, 2, function(x) { sum(is.na(x))/length(x) }) hist(MNsnp, breaks=10) # Missingness is higher than in the Moroccan example. SNPS <- SNPS[,MNsnp<0.05] # We apply a 5% threshold ### Missingness by Individual MNind <- apply(SNPS, 1, function(x) { sum(is.na(x))/length(x) }) hist(MNind, breaks=100) # Only one individual has very high missingness. SNPS <- SNPS[MNind<0.05,] # We apply a 5% threshold ### Minor Allele Frequency MAF <- apply(SNPS, 2, function(x) { a=(sum(x=='1', na.rm=T)+sum(x=='0', na.rm=T)*2)/(sum(is.na(x)==F)*2) A=(sum(x=='1', na.rm=T)+sum(x=='2', na.rm=T)*2)/(sum(is.na(x)==F)*2) return(min(a,A)) }) hist(MAF, breaks=100) # The distribution has a left skew. SNPS <- SNPS[,MAF>0.05] # We apply a 5% threshold. ### Major Genotype Frequency MGF <- apply(SNPS, 2, function(x) { aa=(sum(x=='0', na.rm=T))/(sum(is.na(x)==F)) aA=(sum(x=='1', na.rm=T))/(sum(is.na(x)==F)) AA=(sum(x=='2', na.rm=T))/(sum(is.na(x)==F)) return(max(aa,aA,AA)) }) hist(MGF, breaks=100) SNPS <- SNPS[,MGF<0.95] # We apply a 95% threshold dim(SNPS) # The filtering reduced the number of SNPS down to ~5000! Only one individuals was removed. # We lost more than half of the SNPS, which is not surprising with dataset using genome simiplicatin techniques. # We rename the filtered SNP table and we save it as an robject fSNPS=SNPS save(fSNPS, file='StSn_fSNPS.robj') # Update the environmental data removing individuals filtered out. env= env[env$ID%in%rownames(fSNPS),] # keep only rows that are found in fSNP table # Make sure that env and fSNPS are in the same order fSNPS=fSNPS[env$ID,] ### Observed Heterozygosity # We calculate the observed heterozygosity as seen previously HE_ind <- apply(fSNPS,1, function(x){return(sum(x=='1', na.rm=T)/length(x))}) # We can visualize heterozygosity on a map using the same function as we did for the Moroccan sheep. load('plot_map_gradient.Rfun') plot_map_gradient(shp=env, gradient=HE_ind, legendpos='bottomright') # Many point are superposing, to better visualize superposed points we use an additional option of the function that requires the plotrix library (loaded at the begin of the exercise). plot_map_gradient(shp=env, gradient=HE_ind, legendpos='bottomright', superposing = 'expand') # the superposing='expand' flag allows to visualize points falling under the same coordinate. # Another option that allows the visualization of superposed points is the superposing='bysite' flag. plot_map_gradient(shp=env, gradient=HE_ind, legendpos='bottomright', superposing = 'bysite', sites = env$Reefs, main='Mean Obs Heterozygosity per Site') # the superposing='expand' flag allows to visualize points falling under the same coordinate. # This option requires an additional input describing to which site do samples belong to and subsequently calculates the mean value for each of location. # We can observe that there is no particular patterns of heterozygosity across this seascape. ### Population Structure Analysis via PCA X <- fSNPS[,apply(fSNPS,2,function(x) {return(is.na(sum(x))==F)})] # filter out SNPs with missing values dim(X) # The population structure study will involve 2285 loci. PCA <- prcomp(X, scale=T) # launch the PCA ## We can have a look at how the first principal components describe our samples. par(mfrow=c(2,2)) plot(PCA$x[,1:2], xlab='PC1', ylab='PC2') plot(PCA$x[,3:4], xlab='PC3', ylab='PC4') plot(PCA$x[,5:6], xlab='PC5', ylab='PC6') plot(PCA$x[,7:8], xlab='PC7', ylab='PC8') # Hard to tell if there is a population structure, it is worth to check the plot of the & of variance explained by each principal component. par(mfrow=c(1,1)) plot(PCA$sdev/sum(PCA$sdev), ylab='% of Variance Explained', xlab='PCs') # The first two PCs clearly stand out from the global trend of the % of explained variance. # It seems as is there is a substantial population structure in this dataset. # To verify this, we look at the distribution of the first two PCs on the map. par(mfrow=c(1,1)) plot_map_gradient(shp=env, gradient=PCA$x[,1], legendpos='bottomright', superposing = 'expand') plot_map_gradient(main='PC1', shp=env, gradient=PCA$x[,1], legendpos='bottomright', superposing = 'bysite', sites = env$Reefs) plot_map_gradient(shp=env, gradient=PCA$x[,2], legendpos='bottomright', superposing = 'expand') plot_map_gradient(main='PC2', shp=env, gradient=PCA$x[,2], legendpos='bottomright', superposing = 'bysite', sites = env$Reefs) # We can see clerly the presence of a spatial pattern of the first two PCs. # PC1 stresses the differences between reefs at the very south of the coast an the rest of the reefs. # PC2 shows the contrast between north and south. # We store the PCA of the genotype matrix as an .robject save(PCA, file='StSn_PCA.robj')