Principal Components Analysis and Plot From SNP Data

(1) Loading in packages & data

# Set working directory
setwd("/Users/amakukhov/Desktop/ComputationalBiology_SPRING2017/Bio381")

# First need to download Bioconductor, which is an open source software that uses R programming language to analyze high-throughput genomic data.
source("https://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
# Installing SNPRelate from Bioconductor
biocLite("SNPRelate")
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.2 (2016-10-31).
## Installing package(s) 'SNPRelate'
## 
## The downloaded binary packages are in
##  /var/folders/l7/l8tb9msd1757m7h3gvtwghmc0000gn/T//Rtmp7yHo76/downloaded_packages
# Load these packages (gds=genomic data structures)
library(SNPRelate)
## Loading required package: gdsfmt
## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
library(gdsfmt)

# Read in the VCF (Variant Call Format) file, a common file structure for single nucleotide polymorphism, or "SNP", data
OASV2rmdupvcf <- ("OASV2-rmdup-Bio381-filtered.recode.vcf")

(2) Reformatting the VCF file into SNPRelate’s native “Genomic Data Structure (gds)”

# Reformatting to GDS considering a 2-allele only model (standard)
snpgdsVCF2GDS(OASV2rmdupvcf, "OASV2gds", method="biallelic.only") 
## VCF Format ==> SNP GDS Format
## Method: exacting biallelic SNPs
## Number of samples: 16
## Parsing "OASV2-rmdup-Bio381-filtered.recode.vcf" ...
## Warning in (function (con = stdin(), n = -1L, ok = TRUE, warn = TRUE,
## encoding = "unknown", : incomplete final line found on 'OASV2-rmdup-Bio381-
## filtered.recode.vcf'
##  import 120128 variants.
## + genotype   { Bit2 16x120128, 469.2K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file 'OASV2gds' (1.2M)
##     # of fragments: 57
##     save to 'OASV2gds.tmp'
##     rename 'OASV2gds.tmp' (1.2M, reduced: 444B)
##     # of fragments: 20
# Ignore the warning above, it is just because it didn't like how I parsed out some of the SNP data because I didn't want to share all of my dataset and using the whole dataset takes longer to run. Not necessary for this example.

# Open the reformatted file and name it
OASV2gds <- snpgdsOpen("OASV2gds")

# Get a summary of the dataset
snpgdsSummary("OASV2gds")
## The file name: /Users/amakukhov/Desktop/ComputationalBiology_SPRING2017/Bio381/OASV2gds 
## The total number of samples: 16 
## The total number of SNPs: 120128 
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

(3) Running PCA on all SNPs and all individual samples

# Note: in this case individuals = a pooled population
# Speed up computation by requesting multiple threads via the num.thread command
# Use autosome.only=FALSE to account for the fact that we are working with non-human organisms (so not 23 chromosomes)

pca <- snpgdsPCA(OASV2gds, autosome.only=FALSE, num.thread=4)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 16 samples, 120,128 SNPs
##     using 4 (CPU) cores
## PCA: the sum of all selected genotypes (0, 1 and 2) = 2982942
## Sun Apr 16 16:24:48 2017    (internal increment: 22272)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed      
## Sun Apr 16 16:24:48 2017    Begin (eigenvalues and eigenvectors)
## Sun Apr 16 16:24:48 2017    Done.
# Create a vector of population IDs for being able to identify via color coding in the plot
# NOTE: Need to make sure these IDs match the order as in the original vcf file
popcode <- c("D1_7.5", "D1_7.5", "D1_7.5", "D1_7.5", "D1_8.0", "D1_8.0", "D1_8.0", "D1_8.0", "D7_7.5", "D7_7.5", "D7_7.5", "D7_7.5", "D7_8.0", "D7_8.0", "D7_8.0", "D7_8.0")

# Getting strucutre of PCA, which provides info on the PCs
str(pca)
## List of 8
##  $ sample.id: chr [1:16] "OASV2_DNA_D1_7_5_S_03" "OASV2_DNA_D1_7_5_S_04" "OASV2_DNA_D1_7_5_S_05" "OASV2_DNA_D1_7_5_S_07" ...
##  $ snp.id   : int [1:120128] 1 2 3 4 5 6 7 8 9 10 ...
##  $ eigenval : num [1:16] 1.11 1.03 1.03 1.02 1.02 ...
##  $ eigenvect: num [1:16, 1:16] 0.1058 -0.0993 0.3472 -0.1185 0.4619 ...
##  $ varprop  : num [1:16] 0.0738 0.0689 0.0685 0.0682 0.0677 ...
##  $ TraceXTX : num 2842153
##  $ Bayesian : logi FALSE
##  $ genmat   : NULL
##  - attr(*, "class")= chr "snpgdsPCAClass"

(4) Create a biplot of the first 2 PCA axes & color code according to population

plot(pca$eigenvect[,1],pca$eigenvect[,2], main="PCA with quality filtered SNPs", cex.main=2, xlab="Principal Component axis 1", cex.lab=1.5, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("bottomleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))

# Notice that the legend is in the way of some of the points. Can correct this by moving it:
plot(pca$eigenvect[,1],pca$eigenvect[,2], main="PCA with quality filtered SNPs #2", cex.main=2, xlab="Principal Component axis 1", cex.lab=1.3, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("topleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))

(5) Considering Linkage Disequilibrium (LD)

It’s worth considering how inference of population structure via PCA might be sensitive to your choice of SNPs (e.g., Linkage Disequilibrium, or ‘LD’). You can filter for SNPs that are not in high LD with others, and then re-run the PCA and plot.

# This prunes any SNPs with an r^2 > 0.2 between adjacent SNPs
snpset <- snpgdsLDpruning(OASV2gds, autosome.only=FALSE, ld.threshold=0.2)
snpset.id <- unlist(snpset)
# Now we run PCA again on the pruned data with no LD and plot (user-defined LD threshold):
pca_noLD <- snpgdsPCA(OASV2gds, snp.id=snpset.id, autosome.only=FALSE, num.thread=4)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 16 samples, 4,053 SNPs
##     using 4 (CPU) cores
## PCA: the sum of all selected genotypes (0, 1 and 2) = 101129
## Sun Apr 16 16:24:51 2017    (internal increment: 22272)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed      
## Sun Apr 16 16:24:51 2017    Begin (eigenvalues and eigenvectors)
## Sun Apr 16 16:24:51 2017    Done.
plot(pca_noLD$eigenvect[,1],-1*(pca_noLD$eigenvect[,2]), main="PCA with no SNPs in LD", xlab="Principal Component axis 1", cex.lab=1.5, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("topleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))

# Saving PDF of plot
pdf(file="OASV2 PCA with quality filtered SNPs.pdf", height=5, width=8)

plot(pca$eigenvect[,1],pca$eigenvect[,2], main="PCA with quality filtered SNPs #2", cex.main=2, xlab="Principal Component axis 1", cex.lab=1.3, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("topleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))

dev.off()
## quartz_off_screen 
##                 2