#### Population and Evolution Genetics - Genetic analysis of populations using R #### ### One gene locus with two alleles, equal frequencies ## Simulated data Alleles <- c("A", "a") # alleles p <- 0.5 # frequency for allele A or A1 q <- 1-p # frequency for allele a or A2 fpq <- c(p,q) # combine both frequency Nind <- 100 # number of individuals in a population # We create our first random population: pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2)) pop # We can create a genotype matrix for analysis x<-paste(pop[,1], pop[,2], sep = "/") x x1<-as.matrix(x) x1 # Allele and genotype frequencies t<-table(x1) t P<-t[4]/Nind Q<-t[1]/Nind H<-(t[2]+t[3])/Nind # Table of genotype frequencies genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE) colnames(genotypes) = c("P", "H", "Q") rownames(genotypes) <- c("pop") genotypes # A plot of genotype frequencies barplot(genotypes) # Put some colour in your life barplot(genotypes, main = "Genotype Frequencies", ylab = "Frequency", ylim = c(0,1), border="red", col="blue", density=10) p1<-P+H/2 q1<-Q+H/2 # Table of allele frequencies alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE) colnames(alleles) = c("p", "q") rownames(alleles) <- c("pop") alleles # A plot of allele frequencies barplot(alleles, main = "Allele Frequencies", ylab = "Frequency", ylim = c(0,1), border="blue", col="red", density=10) # Some more metrics Ho <- unname(H) He <- 1-((p1^2)+(q1^2)) F <- (He-Ho)/He # A table comparing Ho and He index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE) colnames(index) = c("Ho", "He", "F") rownames(index) <- c("pop") index # A plot comparing Ho and He fancy<-c("darkred", "darkgreen") barplot(index[,1:2], main = "Heterozygosity (observed/expected)", ylim = c(0,1), col=fancy, density=70) # A final table summarizing everything h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE) colnames(h_table) = c("N", "p", "q", "Ho", "He", "F") rownames(h_table) <- c("pop") h_table # Create a function that does all this diversity <- function(Alleles, p=0.5, Nind=100) { q <- 1-p # frequency for allele a or A2 fpq <- c(p,q) # combine both frequency pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2)) x<-paste(pop[,1], pop[,2], sep = "/") x1<<-as.matrix(x) # Since we need x1 for further analysis outside the function, we define it with <<- t<-table(x1) P<-t[4]/Nind Q<-t[1]/Nind H<-(t[2]+t[3])/Nind genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE) colnames(genotypes) = c("P", "H", "Q") rownames(genotypes) <- c("pop") p1<-P+H/2 q1<-Q+H/2 alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE) colnames(alleles) = c("p", "q") rownames(alleles) <- c("pop") Ho <- unname(H) He <- 1-((p1^2)+(q1^2)) F <- (He-Ho)/He index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE) colnames(index) = c("Ho", "He", "F") rownames(index) <- c("pop") h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE) colnames(h_table) = c("N", "p", "q", "Ho", "He", "F") rownames(h_table) <- c("pop") print(h_table) # A multiple plot par(mfrow=c(1,3)) barplot(genotypes, main = "Genotype Frequencies", ylab = "Frequency", ylim = c(0,1), border="red", col="blue", density=10) barplot(alleles, main = "Allele Frequencies", ylab = "Frequency", ylim = c(0,1), border="blue", col="red", density=10) fancy<-c("darkred", "darkgreen") barplot(index[,1:2], main = "Heterozygosity (observed/expected)", ylim = c(0,1), col=fancy, density=70) } # Now lets try different allele frequencies and population sizes. What do you observe? diversity(Alleles,0.5,100) diversity(Alleles,0.2,10000) diversity(Alleles,0.1,10000) diversity(Alleles,0.2,10) ## Real Data: Using our new function to measure genetic diversity in a real case: Hop population in Central Greece # import data file with 9 SNPs hop<-read.csv("hop.csv", header=TRUE) # We create a genotype matrix for analysis for SNP_1 hop1<-hop[,2] y1<-as.matrix(hop1) Nind <- length(y1) # number of individuals in a population # Allele and genotype frequencies z<-table(y1) z P<-z[1]/Nind Q<-z[3]/Nind H<-z[2]/Nind p1<-P+H/2 q1<-Q+H/2 # Table of genotype frequencies genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE) colnames(genotypes) = c("P", "H", "Q") rownames(genotypes) <- c("hop") genotypes # Some more metrics Ho <- unname(H) He <- 1-((p1^2)+(q1^2)) F <- (He-Ho)/He # A final table summarizing everything h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE) colnames(h_table) = c("N", "p", "q", "Ho", "He", "F") rownames(h_table) <- c("pop") h_table # A combined plot par(mfrow=c(1,3)) barplot(genotypes, main = "Genotype Frequencies", ylab = "Frequency", ylim = c(0,1), border="red", col="blue", density=10) barplot(alleles, main = "Allele Frequencies", ylab = "Frequency", ylim = c(0,1), border="blue", col="red", density=10) fancy<-c("darkred", "darkgreen") barplot(index[,1:2], main = "Heterozygosity (observed/expected)", ylim = c(0,1), col=fancy, density=70) # Can you repeat this analysis for the second SNP? ## Simulated data: Analysis using adegenet # Let's create a datafile using our function diversity(Alleles,0.2,10000) # Call the adegenet library library(adegenet) # Create a geneind object x2<-df2genind(x1, sep="/") x2 x3<-summary(x2) x3 # Some metrics Ho <- mean(x3$Hobs) He <- mean(x3$Hexp) F <- (He-Ho)/He Na <- mean(x2$loc.n.all) # A final table summarizing everything h_table = matrix(c(Na, Ho, He, F), ncol=4, byrow=TRUE) colnames(h_table) = c("Na", "Ho", "He", "F") rownames(h_table) <- c("pop") h_table # A plot comparing Ho and He index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE) fancy<-c("darkred", "darkgreen") barplot(index[,1:2], main = "Heterozygosity (observed/expected)", ylim = c(0,1), col=fancy, density=70) ### More Loci and more populations: diversity and differentiation ## Real data: Pinus heldreichii in North Greece # Get the data info<-read.csv("robolo1.csv") # number of individuals nind=nrow(info) # number of loci, by removing columns of info nloci=ncol(info)-5 # make genind object obj<-df2genind(info[,c(6:19)], sep="/", NA.char = "NA") # add geographic coordinates and create a first map obj@other$xy<-info[,4:5] plot(obj$other$xy,cex=1.5,xlab='x',ylab='y') points(obj$other$xy, col="red",pch=20) # add the (sub)population information pop(obj) <- info[,3] obj # Calculate some basic genetic parameters for the whole dataset sum<-summary(obj) sum # Expected and observed hererozygosity and allelic richness Ho_all <- mean(sum$Hobs) He_all <- mean(sum$Hexp) F_all <- (He_all-Ho_all)/He_all Na_all <- mean(obj$loc.n.all) # A table of diversity metrics robolo_table = matrix(c(Na_all, Ho_all, He_all, F_all), ncol=4, byrow=TRUE) colnames(robolo_table) = c("Na", "Ho", "He", "F") rownames(robolo_table) <- c("robolo") robolo_table # A plot comparing Ho and He index = matrix(c(Ho_all, He_all, F_all), ncol=3, byrow=TRUE) fancy<-c("darkred", "darkgreen") barplot(index[,1:2], main = "Heterozygosity (observed/expected)", ylim = c(0,1), col=fancy, density=70) # Create a genpop object objp <- genind2genpop(obj) objp popNames(objp) # Make a plot showing the number of genotypes per population barplot(sum$n.by.pop, main="Sample sizes per population", ylab="Number of genotypes",las=3) # Genetic distances (Nei) between populations. We will need the package hierfstat library(hierfstat) nei <- genet.dist(obj, method = "Dm") nei # Constructing a neighbor joining tree using ape library(ape) tree <- nj(nei) ### Creating a phylogenetic tree among species using homologous genes ## The cytochrome C oxidase I (COI) gene in hominids # We will need the packages msa and seqinr library(seqinr) library(msa) # Download the "all.fasta.txt" file dna<-readDNAStringSet("all.fasta.txt") dna # we align the sequences using msa (multiple sequence alignment) zz <- msa(dna) zz # We turn zz into an "align" object to go on finding the genetic distances using the "identitiy" method align <- msaConvert(zz, type="seqinr::alignment") d <- dist.alignment(align, "identity") d # Create the nj tree. Is it correct? tree1 <- nj(d) plot(tree1) # We need to root the tree using the wild boar as an outgroup. Now the tree is correct! rtree<-root(tree1, "Sus_scrofa") plot(rtree)