We consider one gene locus with two alleles (A and a). We will silumate datasets with genotypes as a combination of these two alleles.
# R
Alleles <- c("A", "a") # alleles
p <- 0.5 # frequency for allele A or A1
q <- 1-p # frequency for allele a or A2
fpq <- c(p,q) # combine both frequency
Nind <- 100 # number of individuals in a population
We create a random population:
# R
pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2))
pop
## [,1] [,2]
## [1,] "a" "A"
## [2,] "a" "A"
## [3,] "A" "A"
## [4,] "a" "a"
## [5,] "A" "a"
## [6,] "A" "a"
## [7,] "A" "a"
## [8,] "a" "a"
## [9,] "a" "A"
## [10,] "A" "A"
## [11,] "a" "A"
## [12,] "a" "A"
## [13,] "A" "a"
## [14,] "a" "A"
## [15,] "a" "a"
## [16,] "A" "A"
## [17,] "A" "a"
## [18,] "a" "a"
## [19,] "a" "a"
## [20,] "A" "a"
## [21,] "A" "A"
## [22,] "a" "A"
## [23,] "A" "a"
## [24,] "a" "a"
## [25,] "A" "A"
## [26,] "a" "A"
## [27,] "a" "A"
## [28,] "A" "a"
## [29,] "a" "A"
## [30,] "a" "A"
## [31,] "A" "a"
## [32,] "A" "a"
## [33,] "A" "A"
## [34,] "a" "a"
## [35,] "A" "A"
## [36,] "A" "a"
## [37,] "A" "a"
## [38,] "A" "a"
## [39,] "A" "a"
## [40,] "A" "a"
## [41,] "A" "a"
## [42,] "A" "A"
## [43,] "A" "A"
## [44,] "a" "a"
## [45,] "a" "a"
## [46,] "a" "A"
## [47,] "a" "A"
## [48,] "a" "a"
## [49,] "A" "A"
## [50,] "A" "a"
## [51,] "a" "a"
## [52,] "a" "a"
## [53,] "A" "A"
## [54,] "a" "a"
## [55,] "A" "a"
## [56,] "a" "a"
## [57,] "a" "A"
## [58,] "a" "A"
## [59,] "A" "a"
## [60,] "A" "A"
## [61,] "A" "A"
## [62,] "A" "A"
## [63,] "a" "a"
## [64,] "A" "a"
## [65,] "A" "a"
## [66,] "a" "a"
## [67,] "A" "a"
## [68,] "a" "a"
## [69,] "a" "a"
## [70,] "a" "a"
## [71,] "a" "A"
## [72,] "A" "A"
## [73,] "a" "A"
## [74,] "A" "a"
## [75,] "A" "a"
## [76,] "a" "A"
## [77,] "A" "a"
## [78,] "a" "a"
## [79,] "A" "a"
## [80,] "A" "a"
## [81,] "a" "A"
## [82,] "a" "A"
## [83,] "a" "A"
## [84,] "a" "A"
## [85,] "a" "A"
## [86,] "a" "A"
## [87,] "a" "A"
## [88,] "a" "A"
## [89,] "a" "A"
## [90,] "a" "a"
## [91,] "a" "a"
## [92,] "a" "a"
## [93,] "A" "a"
## [94,] "A" "a"
## [95,] "a" "a"
## [96,] "A" "a"
## [97,] "A" "a"
## [98,] "a" "A"
## [99,] "a" "a"
## [100,] "a" "A"
We can create a genotype matrix for analysis
# R
x<-paste(pop[,1], pop[,2], sep = "/")
x
## [1] "a/A" "a/A" "A/A" "a/a" "A/a" "A/a" "A/a" "a/a" "a/A" "A/A" "a/A" "a/A"
## [13] "A/a" "a/A" "a/a" "A/A" "A/a" "a/a" "a/a" "A/a" "A/A" "a/A" "A/a" "a/a"
## [25] "A/A" "a/A" "a/A" "A/a" "a/A" "a/A" "A/a" "A/a" "A/A" "a/a" "A/A" "A/a"
## [37] "A/a" "A/a" "A/a" "A/a" "A/a" "A/A" "A/A" "a/a" "a/a" "a/A" "a/A" "a/a"
## [49] "A/A" "A/a" "a/a" "a/a" "A/A" "a/a" "A/a" "a/a" "a/A" "a/A" "A/a" "A/A"
## [61] "A/A" "A/A" "a/a" "A/a" "A/a" "a/a" "A/a" "a/a" "a/a" "a/a" "a/A" "A/A"
## [73] "a/A" "A/a" "A/a" "a/A" "A/a" "a/a" "A/a" "A/a" "a/A" "a/A" "a/A" "a/A"
## [85] "a/A" "a/A" "a/A" "a/A" "a/A" "a/a" "a/a" "a/a" "A/a" "A/a" "a/a" "A/a"
## [97] "A/a" "a/A" "a/a" "a/A"
x1<-as.matrix(x)
x1
## [,1]
## [1,] "a/A"
## [2,] "a/A"
## [3,] "A/A"
## [4,] "a/a"
## [5,] "A/a"
## [6,] "A/a"
## [7,] "A/a"
## [8,] "a/a"
## [9,] "a/A"
## [10,] "A/A"
## [11,] "a/A"
## [12,] "a/A"
## [13,] "A/a"
## [14,] "a/A"
## [15,] "a/a"
## [16,] "A/A"
## [17,] "A/a"
## [18,] "a/a"
## [19,] "a/a"
## [20,] "A/a"
## [21,] "A/A"
## [22,] "a/A"
## [23,] "A/a"
## [24,] "a/a"
## [25,] "A/A"
## [26,] "a/A"
## [27,] "a/A"
## [28,] "A/a"
## [29,] "a/A"
## [30,] "a/A"
## [31,] "A/a"
## [32,] "A/a"
## [33,] "A/A"
## [34,] "a/a"
## [35,] "A/A"
## [36,] "A/a"
## [37,] "A/a"
## [38,] "A/a"
## [39,] "A/a"
## [40,] "A/a"
## [41,] "A/a"
## [42,] "A/A"
## [43,] "A/A"
## [44,] "a/a"
## [45,] "a/a"
## [46,] "a/A"
## [47,] "a/A"
## [48,] "a/a"
## [49,] "A/A"
## [50,] "A/a"
## [51,] "a/a"
## [52,] "a/a"
## [53,] "A/A"
## [54,] "a/a"
## [55,] "A/a"
## [56,] "a/a"
## [57,] "a/A"
## [58,] "a/A"
## [59,] "A/a"
## [60,] "A/A"
## [61,] "A/A"
## [62,] "A/A"
## [63,] "a/a"
## [64,] "A/a"
## [65,] "A/a"
## [66,] "a/a"
## [67,] "A/a"
## [68,] "a/a"
## [69,] "a/a"
## [70,] "a/a"
## [71,] "a/A"
## [72,] "A/A"
## [73,] "a/A"
## [74,] "A/a"
## [75,] "A/a"
## [76,] "a/A"
## [77,] "A/a"
## [78,] "a/a"
## [79,] "A/a"
## [80,] "A/a"
## [81,] "a/A"
## [82,] "a/A"
## [83,] "a/A"
## [84,] "a/A"
## [85,] "a/A"
## [86,] "a/A"
## [87,] "a/A"
## [88,] "a/A"
## [89,] "a/A"
## [90,] "a/a"
## [91,] "a/a"
## [92,] "a/a"
## [93,] "A/a"
## [94,] "A/a"
## [95,] "a/a"
## [96,] "A/a"
## [97,] "A/a"
## [98,] "a/A"
## [99,] "a/a"
## [100,] "a/A"
Absolute and relative frequencies of genotypes
# R
t<-table(x1)
t
## x1
## a/a a/A A/a A/A
## 25 29 31 15
P<-t[4]/Nind
Q<-t[1]/Nind
H<-(t[2]+t[3])/Nind
Table of genotype frequencies
# R
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("pop")
genotypes
## P H Q
## pop 0.15 0.6 0.25
A plot of genotype frequencies
# R
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
Allele frequencies
# R
p1<-P+H/2
q1<-Q+H/2
Table of allele frequencies
# R
alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE)
colnames(alleles) = c("p", "q")
rownames(alleles) <- c("pop")
alleles
## p q
## pop 0.45 0.55
A plot of allele frequencies
# R
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
Observed and expected heterozygosity
# R
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He
A table comparing Ho and He
# R
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
colnames(index) = c("Ho", "He", "F")
rownames(index) <- c("pop")
index
## Ho He F
## pop 0.6 0.495 -0.2121212
A plot comparing Ho and He
# R
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
A final table summarizing everything
# R
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table
## N p q Ho He F
## pop 100 0.45 0.55 0.6 0.495 -0.2121212
# R
diversity <- function(Alleles, p=0.5, Nind=100) {
q <- 1-p # frequency for allele a or A2
fpq <- c(p,q) # combine both frequency
pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2))
x<-paste(pop[,1], pop[,2], sep = "/")
x1<<-as.matrix(x) # Since we need x1 for further analysis outside the function, we define it with <<-
t<-table(x1)
P<-t[4]/Nind
Q<-t[1]/Nind
H<-(t[2]+t[3])/Nind
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("pop")
p1<-P+H/2
q1<-Q+H/2
alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE)
colnames(alleles) = c("p", "q")
rownames(alleles) <- c("pop")
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
colnames(index) = c("Ho", "He", "F")
rownames(index) <- c("pop")
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
print(h_table)
# A multiple plot
par(mfrow=c(1,3))
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
}
Now lets try different allele frequencies and population sizes. What do you observe?
# R
diversity(Alleles,0.5,100)
## N p q Ho He F
## pop 100 0.47 0.53 0.52 0.4982 -0.04375753
diversity(Alleles,0.2,10000)
## N p q Ho He F
## pop 10000 0.1972 0.8028 0.322 0.3166243 -0.0169781
diversity(Alleles,0.1,10000)
## N p q Ho He F
## pop 10000 0.1001 0.8999 0.1794 0.18016 0.004218362
diversity(Alleles,0.2,10)
## N p q Ho He F
## pop 10 NA 0.8 0.4 NA NA
Import data file with 9 SNPs
# R
hop<-read.csv("hop.csv", header=TRUE)
We create a genotype matrix for analysis for SNP_1
# R
hop1<-hop[,2]
y1<-as.matrix(hop1)
Nind <- length(y1) # number of individuals in a population
Allele and genotype frequencies
# R
z<-table(y1)
z
## y1
## CC CT TT
## 16 35 18
P<-z[1]/Nind
Q<-z[3]/Nind
H<-z[2]/Nind
p1<-P+H/2
q1<-Q+H/2
Table of genotype frequencies
# R
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("hop")
genotypes
## P H Q
## hop 0.2318841 0.5072464 0.2608696
Some more metrics
# R
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He
A final table summarizing everything
# R
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table
## N p q Ho He F
## pop 69 0.4855072 0.5144928 0.5072464 0.4995799 -0.01534581
A combined plot
# R
par(mfrow=c(1,3))
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
Can you repeat this analysis for the second SNP?
Let’s create a datafile using our function
# R
diversity(Alleles,0.2,10000)
## N p q Ho He F
## pop 10000 0.2036 0.7964 0.3312 0.3242941 -0.02129524
Call the adegenet library
# R
library(adegenet)
## Warning: package 'adegenet' was built under R version 4.3.0
## Warning: package 'ade4' was built under R version 4.3.0
Create a geneind object
# R
x2<-df2genind(x1, sep="/")
x2
## /// GENIND OBJECT /////////
##
## // 10,000 individuals; 1 locus; 2 alleles; size: 746.6 Kb
##
## // Basic content
## @tab: 10000 x 2 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 2 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: df2genind(X = x1, sep = "/")
##
## // Optional content
## - empty -
x3<-summary(x2)
x3
##
## // Number of individuals: 10000
## // Group sizes: 10000
## // Number of alleles per locus: 2
## // Number of alleles per group: 2
## // Percentage of missing data: 0 %
## // Observed heterozygosity: 0.33
## // Expected heterozygosity: 0.32
Some metrics
# R
Ho <- mean(x3$Hobs)
He <- mean(x3$Hexp)
F <- (He-Ho)/He
Na <- mean(x2$loc.n.all)
A final table summarizing everything
# R
h_table = matrix(c(Na, Ho, He, F), ncol=4, byrow=TRUE)
colnames(h_table) = c("Na", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table
## Na Ho He F
## pop 2 0.3312 0.3242941 -0.02129524
A plot comparing Ho and He
# R
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
Get the data
# R
info<-read.csv("robolo1.csv")
number of individuals
# R
nind=nrow(info)
number of loci, by removing columns of info
# R
nloci=ncol(info)-5
make genind object
# R
obj<-df2genind(info[,c(6:19)], sep="/", NA.char = "NA")
add geographic coordinates and create a first map
# R
obj@other$xy<-info[,4:5]
plot(obj$other$xy,cex=1.5,xlab='x',ylab='y')
points(obj$other$xy, col="red",pch=20)
add the (sub)population information
# R
pop(obj) <- info[,3]
obj
## /// GENIND OBJECT /////////
##
## // 79 individuals; 14 loci; 29 alleles; size: 30.3 Kb
##
## // Basic content
## @tab: 79 x 29 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-3)
## @loc.fac: locus factor for the 29 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: df2genind(X = info[, c(6:19)], sep = "/", NA.char = "NA")
##
## // Optional content
## @pop: population of each individual (group size range: 4-6)
## @other: a list containing: xy
Calculate some basic genetic parameters for the whole dataset
# R
sum<-summary(obj)
sum
##
## // Number of individuals: 79
## // Group sizes: 6 5 6 6 5 6 6 5 6 6 4 6 6 6
## // Number of alleles per locus: 2 2 2 2 3 2 2 2 2 2 2 2 2 2
## // Number of alleles per group: 25 26 25 24 25 25 25 25 28 26 26 26 26 24
## // Percentage of missing data: 0 %
## // Observed heterozygosity: 0.51 0.48 0.25 0.37 0.3 0.03 0.14 0.23 0.58 0.51 0.51 0.1 0.24 0.38
## // Expected heterozygosity: 0.47 0.46 0.26 0.47 0.3 0.05 0.21 0.2 0.5 0.48 0.48 0.14 0.21 0.48
Expected and observed hererozygosity and allelic richness
# R
Ho_all <- mean(sum$Hobs)
He_all <- mean(sum$Hexp)
F_all <- (He_all-Ho_all)/He_all
Na_all <- mean(obj$loc.n.all)
A table of diversity metrics
# R
robolo_table = matrix(c(Na_all, Ho_all, He_all, F_all), ncol=4, byrow=TRUE)
colnames(robolo_table) = c("Na", "Ho", "He", "F")
rownames(robolo_table) <- c("robolo")
robolo_table
## Na Ho He F
## robolo 2.071429 0.3300181 0.3361755 0.01831614
A plot comparing Ho and He
# R
index = matrix(c(Ho_all, He_all, F_all), ncol=3, byrow=TRUE)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
Create a genpop object
# R
objp <- genind2genpop(obj)
##
## Converting data from a genind to a genpop object...
##
## ...done.
objp
## /// GENPOP OBJECT /////////
##
## // 14 populations; 14 loci; 29 alleles; size: 15.3 Kb
##
## // Basic content
## @tab: 14 x 29 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-3)
## @loc.fac: locus factor for the 29 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: genind2genpop(x = obj)
##
## // Optional content
## @other: a list containing: xy
popNames(objp)
## [1] "Laista1" "Laista2" "Vermio1" "Vermio2" "Tymfi1"
## [6] "Tymfi2" "Metsovo1" "Metsovo2" "Smolikas1" "Smolikas2"
## [11] "Smolikas3" "Vasilitsa1" "Vasilitsa2" "Vasilitsa3"
Make a plot showing the number of genotypes per population
# R
barplot(sum$n.by.pop, main="Sample sizes per population",
ylab="Number of genotypes",las=3)
Genetic distances (Nei) between populations. We will need the package hierfstat
# R
library(hierfstat)
nei <- genet.dist(obj, method = "Dm")
nei
## Laista1 Laista2 Vermio1 Vermio2 Tymfi1
## Laista2 0.025793651
## Vermio1 0.035714286 0.018253968
## Vermio2 0.103670635 0.072023810 0.032242063
## Tymfi1 0.062698413 0.013571429 0.036111111 0.089880952
## Tymfi2 0.021329365 0.040773810 0.059027778 0.142361111 0.065773810
## Metsovo1 0.030257937 0.022519841 0.051091270 0.138888889 0.052281746
## Metsovo2 0.057460317 0.019285714 0.043968254 0.092976190 0.024285714
## Smolikas1 0.027777778 0.004761905 0.023809524 0.081845238 0.023809524
## Smolikas2 0.018849206 0.004960317 0.028769841 0.093253968 0.021626984
## Smolikas3 0.053695437 0.039508929 0.028893849 0.062872024 0.050223214
## Vasilitsa1 0.053571429 0.026388889 0.018849206 0.049107143 0.050198413
## Vasilitsa2 0.037202381 0.015972222 0.036210317 0.090277778 0.050496032
## Vasilitsa3 0.037202381 0.024305556 0.031250000 0.055059524 0.061210317
## Tymfi2 Metsovo1 Metsovo2 Smolikas1 Smolikas2
## Laista2
## Vermio1
## Vermio2
## Tymfi1
## Tymfi2
## Metsovo1 0.034722222
## Metsovo2 0.041488095 0.035138889
## Smolikas1 0.049107143 0.020337302 0.030476190
## Smolikas2 0.023313492 0.022321429 0.018769841 0.008928571
## Smolikas3 0.087425595 0.078993056 0.090223214 0.039806548 0.053695437
## Vasilitsa1 0.077876984 0.050099206 0.060436508 0.027777778 0.042658730
## Vasilitsa2 0.057539683 0.016865079 0.036924603 0.013392857 0.022321429
## Vasilitsa3 0.065476190 0.053571429 0.048829365 0.026289683 0.029265873
## Smolikas3 Vasilitsa1 Vasilitsa2
## Laista2
## Vermio1
## Vermio2
## Tymfi1
## Tymfi2
## Metsovo1
## Metsovo2
## Smolikas1
## Smolikas2
## Smolikas3
## Vasilitsa1 0.023933532
## Vasilitsa2 0.059647817 0.025297619
## Vasilitsa3 0.052207341 0.028273810 0.018849206
Constructing a neighbor joining tree using ape
# R
library(ape)
tree <- nj(nei)
plot(tree)
We will need the packages msa and seqinr
# R
library(seqinr)
library(msa)
## Warning: package 'msa' was built under R version 4.3.0
## Warning: package 'Biostrings' was built under R version 4.3.0
## Warning: package 'BiocGenerics' was built under R version 4.3.0
## Warning: package 'S4Vectors' was built under R version 4.3.0
## Warning: package 'IRanges' was built under R version 4.3.0
## Warning: package 'XVector' was built under R version 4.3.0
## Warning: package 'GenomeInfoDb' was built under R version 4.3.0
Download the “all.fasta.txt” file
# R
dna<-readDNAStringSet("all.fasta.txt")
dna
## DNAStringSet object of length 6:
## width seq names
## [1] 1542 ATGTTCACCGACCGCTGACTATT...AACCCGTGTACATAAAATCTAGA Pan_paniscus
## [2] 1542 ATGTTCACCGACCGCTGACTATT...AACCCGTATACATAAAATCTAGA Pan_troglodytes
## [3] 1542 ATGTTCGCCGACCGTTGACTATT...AACCCGTATACATAAAATCTAGA Homo_denisovan
## [4] 1542 ATGTTCGCCGACCGTTGACTATT...AGCCCGTATACATAAAATCTAGA Homo_neanderthale...
## [5] 1542 ATGTTCGCCGACCGTTGACTATT...AACCCGTATACATAAAATCTAGA Homo_sapiens
## [6] 1545 ATGTTCGTAAATCGTTGACTATA...CAACATATATCAACCTAAAATAA Sus_scrofa
We align the sequences using msa (multiple sequence alignment)
# R
zz <- msa(dna)
## use default substitution matrix
zz
## CLUSTAL 2.1
##
## Call:
## msa(dna)
##
## MsaDNAMultipleAlignment with 6 rows and 1545 columns
## aln names
## [1] ATGTTCACCGACCGCTGACTATTCTC...ACCCGTGTACATAAAATCTAGA--- Pan_paniscus
## [2] ATGTTCACCGACCGCTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Pan_troglodytes
## [3] ATGTTCGCCGACCGTTGACTATTCTC...GCCCGTATACATAAAATCTAGA--- Homo_neanderthale...
## [4] ATGTTCGCCGACCGTTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Homo_sapiens
## [5] ATGTTCGCCGACCGTTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Homo_denisovan
## [6] ATGTTCGTAAATCGTTGACTATACTC...ACCAACATATATCAACCTAAAATAA Sus_scrofa
## Con ATGTTCGCCGACCGTTGACTATTCTC...ACCCGTATACATAAAATCTAGA--- Consensus
We turn zz into an “align” object to go on finding the genetic distances using the “identitiy” method
# R
align <- msaConvert(zz, type="seqinr::alignment")
d <- dist.alignment(align, "identity")
d
## Pan_paniscus Pan_troglodytes Homo_neanderthalensis
## Pan_troglodytes 0.1972575
## Homo_neanderthalensis 0.3023900 0.2947883
## Homo_sapiens 0.3045270 0.2947883 0.1018633
## Homo_denisovan 0.3098052 0.3002377 0.1549026
## Sus_scrofa 0.4647078 0.4605022 0.4569680
## Homo_sapiens Homo_denisovan
## Pan_troglodytes
## Homo_neanderthalensis
## Homo_sapiens
## Homo_denisovan 0.1462900
## Sus_scrofa 0.4597975 0.4590918
Create the nj tree. Is it correct?
# R
tree1 <- nj(d)
plot(tree1)
We need to root the tree using the wild boar as an outgroup. Now the tree is correct!
# R
rtree<-root(tree1, "Sus_scrofa")
plot(rtree)