1 One gene locus with two alleles, equal frequencies - Simulated data

1.1 Create dataset

We consider one gene locus with two alleles (A and a). We will silumate datasets with genotypes as a combination of these two alleles.

# R
Alleles <- c("A", "a") # alleles
p       <- 0.5         # frequency for allele A or A1
q       <- 1-p         # frequency for allele a or A2
fpq     <- c(p,q)      # combine both frequency 
Nind    <- 100         # number of individuals in a population

We create a random population:

# R
pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2))
pop

##        [,1] [,2]
##   [1,] "a"  "A" 
##   [2,] "a"  "A" 
##   [3,] "A"  "A" 
##   [4,] "a"  "a" 
##   [5,] "A"  "a" 
##   [6,] "A"  "a" 
##   [7,] "A"  "a" 
##   [8,] "a"  "a" 
##   [9,] "a"  "A" 
##  [10,] "A"  "A" 
##  [11,] "a"  "A" 
##  [12,] "a"  "A" 
##  [13,] "A"  "a" 
##  [14,] "a"  "A" 
##  [15,] "a"  "a" 
##  [16,] "A"  "A" 
##  [17,] "A"  "a" 
##  [18,] "a"  "a" 
##  [19,] "a"  "a" 
##  [20,] "A"  "a" 
##  [21,] "A"  "A" 
##  [22,] "a"  "A" 
##  [23,] "A"  "a" 
##  [24,] "a"  "a" 
##  [25,] "A"  "A" 
##  [26,] "a"  "A" 
##  [27,] "a"  "A" 
##  [28,] "A"  "a" 
##  [29,] "a"  "A" 
##  [30,] "a"  "A" 
##  [31,] "A"  "a" 
##  [32,] "A"  "a" 
##  [33,] "A"  "A" 
##  [34,] "a"  "a" 
##  [35,] "A"  "A" 
##  [36,] "A"  "a" 
##  [37,] "A"  "a" 
##  [38,] "A"  "a" 
##  [39,] "A"  "a" 
##  [40,] "A"  "a" 
##  [41,] "A"  "a" 
##  [42,] "A"  "A" 
##  [43,] "A"  "A" 
##  [44,] "a"  "a" 
##  [45,] "a"  "a" 
##  [46,] "a"  "A" 
##  [47,] "a"  "A" 
##  [48,] "a"  "a" 
##  [49,] "A"  "A" 
##  [50,] "A"  "a" 
##  [51,] "a"  "a" 
##  [52,] "a"  "a" 
##  [53,] "A"  "A" 
##  [54,] "a"  "a" 
##  [55,] "A"  "a" 
##  [56,] "a"  "a" 
##  [57,] "a"  "A" 
##  [58,] "a"  "A" 
##  [59,] "A"  "a" 
##  [60,] "A"  "A" 
##  [61,] "A"  "A" 
##  [62,] "A"  "A" 
##  [63,] "a"  "a" 
##  [64,] "A"  "a" 
##  [65,] "A"  "a" 
##  [66,] "a"  "a" 
##  [67,] "A"  "a" 
##  [68,] "a"  "a" 
##  [69,] "a"  "a" 
##  [70,] "a"  "a" 
##  [71,] "a"  "A" 
##  [72,] "A"  "A" 
##  [73,] "a"  "A" 
##  [74,] "A"  "a" 
##  [75,] "A"  "a" 
##  [76,] "a"  "A" 
##  [77,] "A"  "a" 
##  [78,] "a"  "a" 
##  [79,] "A"  "a" 
##  [80,] "A"  "a" 
##  [81,] "a"  "A" 
##  [82,] "a"  "A" 
##  [83,] "a"  "A" 
##  [84,] "a"  "A" 
##  [85,] "a"  "A" 
##  [86,] "a"  "A" 
##  [87,] "a"  "A" 
##  [88,] "a"  "A" 
##  [89,] "a"  "A" 
##  [90,] "a"  "a" 
##  [91,] "a"  "a" 
##  [92,] "a"  "a" 
##  [93,] "A"  "a" 
##  [94,] "A"  "a" 
##  [95,] "a"  "a" 
##  [96,] "A"  "a" 
##  [97,] "A"  "a" 
##  [98,] "a"  "A" 
##  [99,] "a"  "a" 
## [100,] "a"  "A"

We can create a genotype matrix for analysis

# R
x<-paste(pop[,1], pop[,2], sep = "/")
x

##   [1] "a/A" "a/A" "A/A" "a/a" "A/a" "A/a" "A/a" "a/a" "a/A" "A/A" "a/A" "a/A"
##  [13] "A/a" "a/A" "a/a" "A/A" "A/a" "a/a" "a/a" "A/a" "A/A" "a/A" "A/a" "a/a"
##  [25] "A/A" "a/A" "a/A" "A/a" "a/A" "a/A" "A/a" "A/a" "A/A" "a/a" "A/A" "A/a"
##  [37] "A/a" "A/a" "A/a" "A/a" "A/a" "A/A" "A/A" "a/a" "a/a" "a/A" "a/A" "a/a"
##  [49] "A/A" "A/a" "a/a" "a/a" "A/A" "a/a" "A/a" "a/a" "a/A" "a/A" "A/a" "A/A"
##  [61] "A/A" "A/A" "a/a" "A/a" "A/a" "a/a" "A/a" "a/a" "a/a" "a/a" "a/A" "A/A"
##  [73] "a/A" "A/a" "A/a" "a/A" "A/a" "a/a" "A/a" "A/a" "a/A" "a/A" "a/A" "a/A"
##  [85] "a/A" "a/A" "a/A" "a/A" "a/A" "a/a" "a/a" "a/a" "A/a" "A/a" "a/a" "A/a"
##  [97] "A/a" "a/A" "a/a" "a/A"

x1<-as.matrix(x)
x1

##        [,1] 
##   [1,] "a/A"
##   [2,] "a/A"
##   [3,] "A/A"
##   [4,] "a/a"
##   [5,] "A/a"
##   [6,] "A/a"
##   [7,] "A/a"
##   [8,] "a/a"
##   [9,] "a/A"
##  [10,] "A/A"
##  [11,] "a/A"
##  [12,] "a/A"
##  [13,] "A/a"
##  [14,] "a/A"
##  [15,] "a/a"
##  [16,] "A/A"
##  [17,] "A/a"
##  [18,] "a/a"
##  [19,] "a/a"
##  [20,] "A/a"
##  [21,] "A/A"
##  [22,] "a/A"
##  [23,] "A/a"
##  [24,] "a/a"
##  [25,] "A/A"
##  [26,] "a/A"
##  [27,] "a/A"
##  [28,] "A/a"
##  [29,] "a/A"
##  [30,] "a/A"
##  [31,] "A/a"
##  [32,] "A/a"
##  [33,] "A/A"
##  [34,] "a/a"
##  [35,] "A/A"
##  [36,] "A/a"
##  [37,] "A/a"
##  [38,] "A/a"
##  [39,] "A/a"
##  [40,] "A/a"
##  [41,] "A/a"
##  [42,] "A/A"
##  [43,] "A/A"
##  [44,] "a/a"
##  [45,] "a/a"
##  [46,] "a/A"
##  [47,] "a/A"
##  [48,] "a/a"
##  [49,] "A/A"
##  [50,] "A/a"
##  [51,] "a/a"
##  [52,] "a/a"
##  [53,] "A/A"
##  [54,] "a/a"
##  [55,] "A/a"
##  [56,] "a/a"
##  [57,] "a/A"
##  [58,] "a/A"
##  [59,] "A/a"
##  [60,] "A/A"
##  [61,] "A/A"
##  [62,] "A/A"
##  [63,] "a/a"
##  [64,] "A/a"
##  [65,] "A/a"
##  [66,] "a/a"
##  [67,] "A/a"
##  [68,] "a/a"
##  [69,] "a/a"
##  [70,] "a/a"
##  [71,] "a/A"
##  [72,] "A/A"
##  [73,] "a/A"
##  [74,] "A/a"
##  [75,] "A/a"
##  [76,] "a/A"
##  [77,] "A/a"
##  [78,] "a/a"
##  [79,] "A/a"
##  [80,] "A/a"
##  [81,] "a/A"
##  [82,] "a/A"
##  [83,] "a/A"
##  [84,] "a/A"
##  [85,] "a/A"
##  [86,] "a/A"
##  [87,] "a/A"
##  [88,] "a/A"
##  [89,] "a/A"
##  [90,] "a/a"
##  [91,] "a/a"
##  [92,] "a/a"
##  [93,] "A/a"
##  [94,] "A/a"
##  [95,] "a/a"
##  [96,] "A/a"
##  [97,] "A/a"
##  [98,] "a/A"
##  [99,] "a/a"
## [100,] "a/A"

1.2 Allele and genotype frequencies

Absolute and relative frequencies of genotypes

# R
t<-table(x1)
t

## x1
## a/a a/A A/a A/A 
##  25  29  31  15

P<-t[4]/Nind
Q<-t[1]/Nind
H<-(t[2]+t[3])/Nind

Table of genotype frequencies

# R
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("pop")
genotypes

##        P   H    Q
## pop 0.15 0.6 0.25

A plot of genotype frequencies

# R
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)

Allele frequencies

# R
p1<-P+H/2
q1<-Q+H/2

Table of allele frequencies

# R
alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE)
colnames(alleles) = c("p", "q")
rownames(alleles) <- c("pop")
alleles

##        p    q
## pop 0.45 0.55

A plot of allele frequencies

# R
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)

1.3 Metrics of genetic diversity

Observed and expected heterozygosity

# R
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He

A table comparing Ho and He

# R
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
colnames(index) = c("Ho", "He", "F")
rownames(index) <- c("pop")
index

##      Ho    He          F
## pop 0.6 0.495 -0.2121212

A plot comparing Ho and He

# R
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

A final table summarizing everything

# R
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table

##       N    p    q  Ho    He          F
## pop 100 0.45 0.55 0.6 0.495 -0.2121212

1.4 Create a function that does all this

# R
diversity <- function(Alleles, p=0.5, Nind=100) {
q       <- 1-p         # frequency for allele a or A2
fpq     <- c(p,q)      # combine both frequency 
pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2))
x<-paste(pop[,1], pop[,2], sep = "/")
x1<<-as.matrix(x) # Since we need x1 for further analysis outside the function, we define it with <<-
t<-table(x1)
P<-t[4]/Nind
Q<-t[1]/Nind
H<-(t[2]+t[3])/Nind
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("pop")
p1<-P+H/2
q1<-Q+H/2
alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE)
colnames(alleles) = c("p", "q")
rownames(alleles) <- c("pop")
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
colnames(index) = c("Ho", "He", "F")
rownames(index) <- c("pop")
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
print(h_table)
# A multiple plot
par(mfrow=c(1,3))
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
}

Now lets try different allele frequencies and population sizes. What do you observe?

# R
diversity(Alleles,0.5,100)

##       N    p    q   Ho     He           F
## pop 100 0.47 0.53 0.52 0.4982 -0.04375753

diversity(Alleles,0.2,10000)

##         N      p      q    Ho        He          F
## pop 10000 0.1972 0.8028 0.322 0.3166243 -0.0169781

diversity(Alleles,0.1,10000)

##         N      p      q     Ho      He           F
## pop 10000 0.1001 0.8999 0.1794 0.18016 0.004218362

diversity(Alleles,0.2,10)

##      N  p   q  Ho He  F
## pop 10 NA 0.8 0.4 NA NA

2 Real Data

2.1 Using our new function to measure genetic diversity in a real case: Hop population in Central Greece

Import data file with 9 SNPs

# R
hop<-read.csv("hop.csv", header=TRUE)

We create a genotype matrix for analysis for SNP_1

# R
hop1<-hop[,2]
y1<-as.matrix(hop1)
Nind <- length(y1)         # number of individuals in a population

Allele and genotype frequencies

# R
z<-table(y1)
z

## y1
## CC CT TT 
## 16 35 18

P<-z[1]/Nind
Q<-z[3]/Nind
H<-z[2]/Nind
p1<-P+H/2
q1<-Q+H/2

Table of genotype frequencies

# R
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("hop")
genotypes

##             P         H         Q
## hop 0.2318841 0.5072464 0.2608696

Some more metrics

# R
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He

A final table summarizing everything

# R
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table

##      N         p         q        Ho        He           F
## pop 69 0.4855072 0.5144928 0.5072464 0.4995799 -0.01534581

A combined plot

# R
par(mfrow=c(1,3))
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

Can you repeat this analysis for the second SNP?

2.2 Simulated data: Analysis using adegenet

Let’s create a datafile using our function

# R
diversity(Alleles,0.2,10000)

##         N      p      q     Ho        He           F
## pop 10000 0.2036 0.7964 0.3312 0.3242941 -0.02129524

Call the adegenet library

# R
library(adegenet)

## Warning: package 'adegenet' was built under R version 4.3.0

## Warning: package 'ade4' was built under R version 4.3.0

Create a geneind object

# R
x2<-df2genind(x1, sep="/")
x2

## /// GENIND OBJECT /////////
## 
##  // 10,000 individuals; 1 locus; 2 alleles; size: 746.6 Kb
## 
##  // Basic content
##    @tab:  10000 x 2 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 2 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: df2genind(X = x1, sep = "/")
## 
##  // Optional content
##    - empty -

x3<-summary(x2)
x3

## 
## // Number of individuals: 10000
## // Group sizes: 10000
## // Number of alleles per locus: 2
## // Number of alleles per group: 2
## // Percentage of missing data: 0 %
## // Observed heterozygosity: 0.33
## // Expected heterozygosity: 0.32

Some metrics

# R
Ho <- mean(x3$Hobs)
He <- mean(x3$Hexp)
F <- (He-Ho)/He
Na <- mean(x2$loc.n.all)

A final table summarizing everything

# R
h_table = matrix(c(Na, Ho, He, F), ncol=4, byrow=TRUE)
colnames(h_table) = c("Na", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table

##     Na     Ho        He           F
## pop  2 0.3312 0.3242941 -0.02129524

A plot comparing Ho and He

# R
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

3 More Loci and more populations: diversity and differentiation

3.1 Pinus heldreichii in North Greece

Get the data

# R
info<-read.csv("robolo1.csv")

number of individuals

# R
nind=nrow(info)

number of loci, by removing columns of info

# R
nloci=ncol(info)-5

make genind object

# R
obj<-df2genind(info[,c(6:19)], sep="/", NA.char = "NA")

add geographic coordinates and create a first map

# R
obj@other$xy<-info[,4:5]
plot(obj$other$xy,cex=1.5,xlab='x',ylab='y')
points(obj$other$xy, col="red",pch=20)

add the (sub)population information

# R
pop(obj) <- info[,3]
obj

## /// GENIND OBJECT /////////
## 
##  // 79 individuals; 14 loci; 29 alleles; size: 30.3 Kb
## 
##  // Basic content
##    @tab:  79 x 29 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-3)
##    @loc.fac: locus factor for the 29 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: df2genind(X = info[, c(6:19)], sep = "/", NA.char = "NA")
## 
##  // Optional content
##    @pop: population of each individual (group size range: 4-6)
##    @other: a list containing: xy

Calculate some basic genetic parameters for the whole dataset

# R
sum<-summary(obj)
sum

## 
## // Number of individuals: 79
## // Group sizes: 6 5 6 6 5 6 6 5 6 6 4 6 6 6
## // Number of alleles per locus: 2 2 2 2 3 2 2 2 2 2 2 2 2 2
## // Number of alleles per group: 25 26 25 24 25 25 25 25 28 26 26 26 26 24
## // Percentage of missing data: 0 %
## // Observed heterozygosity: 0.51 0.48 0.25 0.37 0.3 0.03 0.14 0.23 0.58 0.51 0.51 0.1 0.24 0.38
## // Expected heterozygosity: 0.47 0.46 0.26 0.47 0.3 0.05 0.21 0.2 0.5 0.48 0.48 0.14 0.21 0.48

Expected and observed hererozygosity and allelic richness

# R
Ho_all <- mean(sum$Hobs)
He_all <- mean(sum$Hexp)
F_all <- (He_all-Ho_all)/He_all
Na_all <- mean(obj$loc.n.all)

A table of diversity metrics

# R
robolo_table = matrix(c(Na_all, Ho_all, He_all, F_all), ncol=4, byrow=TRUE)
colnames(robolo_table) = c("Na", "Ho", "He", "F")
rownames(robolo_table) <- c("robolo")
robolo_table

##              Na        Ho        He          F
## robolo 2.071429 0.3300181 0.3361755 0.01831614

A plot comparing Ho and He

# R
index = matrix(c(Ho_all, He_all, F_all), ncol=3, byrow=TRUE)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

Create a genpop object

# R
objp <- genind2genpop(obj)

## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

objp

## /// GENPOP OBJECT /////////
## 
##  // 14 populations; 14 loci; 29 alleles; size: 15.3 Kb
## 
##  // Basic content
##    @tab:  14 x 29 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-3)
##    @loc.fac: locus factor for the 29 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: genind2genpop(x = obj)
## 
##  // Optional content
##    @other: a list containing: xy

popNames(objp)

##  [1] "Laista1"    "Laista2"    "Vermio1"    "Vermio2"    "Tymfi1"    
##  [6] "Tymfi2"     "Metsovo1"   "Metsovo2"   "Smolikas1"  "Smolikas2" 
## [11] "Smolikas3"  "Vasilitsa1" "Vasilitsa2" "Vasilitsa3"

Make a plot showing the number of genotypes per population

# R
barplot(sum$n.by.pop, main="Sample sizes per population",
ylab="Number of genotypes",las=3)

Genetic distances (Nei) between populations. We will need the package hierfstat

# R
library(hierfstat)
nei <- genet.dist(obj, method = "Dm")
nei

##                Laista1     Laista2     Vermio1     Vermio2      Tymfi1
## Laista2    0.025793651                                                
## Vermio1    0.035714286 0.018253968                                    
## Vermio2    0.103670635 0.072023810 0.032242063                        
## Tymfi1     0.062698413 0.013571429 0.036111111 0.089880952            
## Tymfi2     0.021329365 0.040773810 0.059027778 0.142361111 0.065773810
## Metsovo1   0.030257937 0.022519841 0.051091270 0.138888889 0.052281746
## Metsovo2   0.057460317 0.019285714 0.043968254 0.092976190 0.024285714
## Smolikas1  0.027777778 0.004761905 0.023809524 0.081845238 0.023809524
## Smolikas2  0.018849206 0.004960317 0.028769841 0.093253968 0.021626984
## Smolikas3  0.053695437 0.039508929 0.028893849 0.062872024 0.050223214
## Vasilitsa1 0.053571429 0.026388889 0.018849206 0.049107143 0.050198413
## Vasilitsa2 0.037202381 0.015972222 0.036210317 0.090277778 0.050496032
## Vasilitsa3 0.037202381 0.024305556 0.031250000 0.055059524 0.061210317
##                 Tymfi2    Metsovo1    Metsovo2   Smolikas1   Smolikas2
## Laista2                                                               
## Vermio1                                                               
## Vermio2                                                               
## Tymfi1                                                                
## Tymfi2                                                                
## Metsovo1   0.034722222                                                
## Metsovo2   0.041488095 0.035138889                                    
## Smolikas1  0.049107143 0.020337302 0.030476190                        
## Smolikas2  0.023313492 0.022321429 0.018769841 0.008928571            
## Smolikas3  0.087425595 0.078993056 0.090223214 0.039806548 0.053695437
## Vasilitsa1 0.077876984 0.050099206 0.060436508 0.027777778 0.042658730
## Vasilitsa2 0.057539683 0.016865079 0.036924603 0.013392857 0.022321429
## Vasilitsa3 0.065476190 0.053571429 0.048829365 0.026289683 0.029265873
##              Smolikas3  Vasilitsa1  Vasilitsa2
## Laista2                                       
## Vermio1                                       
## Vermio2                                       
## Tymfi1                                        
## Tymfi2                                        
## Metsovo1                                      
## Metsovo2                                      
## Smolikas1                                     
## Smolikas2                                     
## Smolikas3                                     
## Vasilitsa1 0.023933532                        
## Vasilitsa2 0.059647817 0.025297619            
## Vasilitsa3 0.052207341 0.028273810 0.018849206

Constructing a neighbor joining tree using ape

# R
library(ape)
tree <- nj(nei)
plot(tree)

4 Creating a phylogenetic tree among species using homologous genes

4.1 The cytochrome C oxidase I (COI) gene in hominids

We will need the packages msa and seqinr

# R
library(seqinr)
library(msa)

## Warning: package 'msa' was built under R version 4.3.0

## Warning: package 'Biostrings' was built under R version 4.3.0

## Warning: package 'BiocGenerics' was built under R version 4.3.0

## Warning: package 'S4Vectors' was built under R version 4.3.0

## Warning: package 'IRanges' was built under R version 4.3.0

## Warning: package 'XVector' was built under R version 4.3.0

## Warning: package 'GenomeInfoDb' was built under R version 4.3.0

Download the “all.fasta.txt” file

# R
dna<-readDNAStringSet("all.fasta.txt")
dna

## DNAStringSet object of length 6:
##     width seq                                               names               
## [1]  1542 ATGTTCACCGACCGCTGACTATT...AACCCGTGTACATAAAATCTAGA Pan_paniscus
## [2]  1542 ATGTTCACCGACCGCTGACTATT...AACCCGTATACATAAAATCTAGA Pan_troglodytes
## [3]  1542 ATGTTCGCCGACCGTTGACTATT...AACCCGTATACATAAAATCTAGA Homo_denisovan
## [4]  1542 ATGTTCGCCGACCGTTGACTATT...AGCCCGTATACATAAAATCTAGA Homo_neanderthale...
## [5]  1542 ATGTTCGCCGACCGTTGACTATT...AACCCGTATACATAAAATCTAGA Homo_sapiens
## [6]  1545 ATGTTCGTAAATCGTTGACTATA...CAACATATATCAACCTAAAATAA Sus_scrofa

We align the sequences using msa (multiple sequence alignment)

# R
zz <- msa(dna)

## use default substitution matrix

zz

## CLUSTAL 2.1  
## 
## Call:
##    msa(dna)
## 
## MsaDNAMultipleAlignment with 6 rows and 1545 columns
##     aln                                                    names
## [1] ATGTTCACCGACCGCTGACTATTCTC...ACCCGTGTACATAAAATCTAGA--- Pan_paniscus
## [2] ATGTTCACCGACCGCTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Pan_troglodytes
## [3] ATGTTCGCCGACCGTTGACTATTCTC...GCCCGTATACATAAAATCTAGA--- Homo_neanderthale...
## [4] ATGTTCGCCGACCGTTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Homo_sapiens
## [5] ATGTTCGCCGACCGTTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Homo_denisovan
## [6] ATGTTCGTAAATCGTTGACTATACTC...ACCAACATATATCAACCTAAAATAA Sus_scrofa
## Con ATGTTCGCCGACCGTTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Consensus

We turn zz into an “align” object to go on finding the genetic distances using the “identitiy” method

# R
align <- msaConvert(zz, type="seqinr::alignment")
d <- dist.alignment(align, "identity")
d

##                       Pan_paniscus Pan_troglodytes Homo_neanderthalensis
## Pan_troglodytes          0.1972575                                      
## Homo_neanderthalensis    0.3023900       0.2947883                      
## Homo_sapiens             0.3045270       0.2947883             0.1018633
## Homo_denisovan           0.3098052       0.3002377             0.1549026
## Sus_scrofa               0.4647078       0.4605022             0.4569680
##                       Homo_sapiens Homo_denisovan
## Pan_troglodytes                                  
## Homo_neanderthalensis                            
## Homo_sapiens                                     
## Homo_denisovan           0.1462900               
## Sus_scrofa               0.4597975      0.4590918

Create the nj tree. Is it correct?

# R
tree1 <- nj(d)
plot(tree1)

We need to root the tree using the wild boar as an outgroup. Now the tree is correct!

# R
rtree<-root(tree1, "Sus_scrofa")
plot(rtree)

Genetic analysis of populations using R

Aristotelis C. Papageorgiou, apapage@mbg.duth.gr

2024-11-27

1 One gene locus with two alleles, equal frequencies - Simulated data

1.1 Create dataset

1.2 Allele and genotype frequencies

1.3 Metrics of genetic diversity

1.4 Create a function that does all this

2 Real Data

2.1 Using our new function to measure genetic diversity in a real case: Hop population in Central Greece

2.2 Simulated data: Analysis using adegenet

3 More Loci and more populations: diversity and differentiation

3.1 Pinus heldreichii in North Greece

4 Creating a phylogenetic tree among species using homologous genes

4.1 The cytochrome C oxidase I (COI) gene in hominids