R Markdown

Rice Data Download the Rice Diversity Panel data RiceDiversity.44K.MSU6. Genotypes_PLINK.zip from http://ricediversity.org/data/sets/44kgwas/. Data is in PLINK format. We will use the read_ped function from the BGLR package to read the PLINK format files into the R environment.

    library(BGLR)
    out<- read_ped("C:/Users/wasim/Desktop/Lectures/Practical/RiceDiversity.44K.MSU6.Genotypes_PLINK/RiceDiversity_44K_Genotypes_PLINK/sativas413.ped")
    p=out$p
    n=out$n
    out=out$x

Recode snp to 0,1,2 format using allele

out[out==2]=NA
    out[out==3]=2
    W <- matrix(out, nrow=p, ncol=n, byrow=TRUE)
    W <- t(W)
    dim(W)
## [1]   413 36901

Accession IDs The accession IDs (individual IDs) are included in .fam file.

fam <-read.table("C:/Users/wasim/Desktop/Lectures/Practical/RiceDiversity.44K.MSU6.Genotypes_PLINK/RiceDiversity_44K_Genotypes_PLINK/sativas413.fam")  
    head(fam)
##           V1 V2 V3 V4 V5 V6
## 1 081215-A05  1  0  0 -9 -9
## 2 081215-A06  3  0  0 -9 -9
## 3 081215-A07  4  0  0 -9 -9
## 4 081215-A08  5  0  0 -9 -9
## 5 090414-A09  6  0  0 -9 -9
## 6 090414-A10  7  0  0 -9 -9
    rownames(W) <- fam$V2

Phenotype data We will use the read.table function to read the phenotype file RiceDiversity_44K_Phenotypes_34traits_PLINK.txt from here. phenotypes

rice.pheno <- read.table("http://www.ricediversity.org/data/sets/44kgwas/RiceDiversity_44K_Phenotypes_34traits_PLINK.txt", header=TRUE, stringsAsFactors = FALSE, sep = "\t")
    table(rownames(W) == rice.pheno$NSFTVID)
## 
## TRUE 
##  413
    y <- matrix(rice.pheno$Flowering.time.at.Arkansas) # # use the first trait 
    rownames(y) <- rice.pheno$NSFTVID
    index <- !is.na(y)
    y <- y[index, 1, drop=FALSE] # 374
    W <- W[index, ] # 374 x 36901
    table(rownames(W) == rownames(y))
## 
## TRUE 
##  374

Population structure

gp <-read.csv("http://ricediversity.org/data/sets/44kgwas/RiceDiversity.44K.germplasm.csv", header = TRUE, skip = 1,  stringsAsFactors = FALSE)   # 431 x 12
    gp2 <- gp[match(rownames(y), gp$NSFTV.ID), ]
    table(rownames(y) == gp2$NSFTV.ID)
## 
## TRUE 
##  374
    plot(gp2$PC1, gp2$PC2, xlab="PC1", ylab="PC2", col=c(1:6)[factor(gp2$Sub.population)])
    legend(x="topleft", legend = levels(factor(gp2$Sub.population)), col=c(1:6), pch=1, cex=0.6)

Quality control Genotype imputation Replace missing marker genotypes with mean values. Then store the marker genotypes in a matrix object W.

for (j in 1:ncol(W)){
    W[,j] <- ifelse(is.na(W[,j]), mean(W[,j], na.rm=TRUE), W[,j])
    }
    sum(W[,1]) # sum of A allele in the first SNP
## [1] 224.6005
    sum(W[,2]) # sum of A allele in the second SNP
## [1] 226.6059
    p <- colSums(W) / (2 * nrow(W)) # or colMean(X) / 2

Minor allele frequency (MAF) Perform a quality control analysis by removing markers with MAF < 0.05. How many markers are removed? Save the filtered genotype matrix in W2.

    maf <- ifelse(p > 0.5, 1-p, p)
    maf.index <- which(maf < 0.05)
    W2 <- W[, -maf.index]  # 374 x 33558 

Genome-wide association studies Fit a single-marker-based linear mixed model by using the GWAS function in the rrBLUP R package. Report the -log10 of p-values for SNP effects. install.packages(“rrBLUP”)

library(rrBLUP)
## Warning: package 'rrBLUP' was built under R version 3.3.3
    map <- read.table("C:/Users/wasim/Desktop/Lectures/Practical/RiceDiversity.44K.MSU6.Genotypes_PLINK/RiceDiversity_44K_Genotypes_PLINK/sativas413.map", header = FALSE, stringsAsFactors = FALSE)
    my.geno <- data.frame(marker=map[,2], chrom=map[,1], pos=map[,4], t(W-1), check.names = FALSE) # W = \in{-1, 0, 1}
    my.pheno <- data.frame(NSFTV_ID=rownames(y), y=y) 

    rel <- GWAS(my.pheno, my.geno, min.MAF=0.05, P3D=TRUE, plot=FALSE)
## [1] "GWAS for trait: y"
## [1] "Variance components estimated. Testing markers."

Manhattan plot install.packages(“qqman”)

library(qqman)
manhattan(x = rel, chr = "chrom", bp = "pos", p = "y", snp = "marker", col = c("blue4", "orange3"), logp = FALSE)

END