#### Population and Evolution Genetics - Genetic analysis of populations using R ####

### One gene locus with two alleles, equal frequencies
## Simulated data

Alleles <- c("A", "a") # alleles
p       <- 0.5         # frequency for allele A or A1
q       <- 1-p         # frequency for allele a or A2
fpq     <- c(p,q)      # combine both frequency 
Nind    <- 100         # number of individuals in a population

# We create our first random population:
pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2))
pop

# We can create a genotype matrix for analysis
x<-paste(pop[,1], pop[,2], sep = "/")
x
x1<-as.matrix(x)
x1

# Allele and genotype frequencies
t<-table(x1)
t
P<-t[4]/Nind
Q<-t[1]/Nind
H<-(t[2]+t[3])/Nind

# Table of genotype frequencies
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("pop")
genotypes

# A plot of genotype frequencies
barplot(genotypes)
# Put some colour in your life
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)

p1<-P+H/2
q1<-Q+H/2

# Table of allele frequencies
alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE)
colnames(alleles) = c("p", "q")
rownames(alleles) <- c("pop")
alleles

# A plot of allele frequencies
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)

# Some more metrics
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He

# A table comparing Ho and He
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
colnames(index) = c("Ho", "He", "F")
rownames(index) <- c("pop")
index

# A plot comparing Ho and He
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

# A final table summarizing everything
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table

# Create a function that does all this

diversity <- function(Alleles, p=0.5, Nind=100) {
q       <- 1-p         # frequency for allele a or A2
fpq     <- c(p,q)      # combine both frequency 
pop <- array(sample(Alleles, 2 * Nind, p = fpq, replace = T ), dim = c(Nind, 2))
x<-paste(pop[,1], pop[,2], sep = "/")
x1<<-as.matrix(x) # Since we need x1 for further analysis outside the function, we define it with <<-
t<-table(x1)
P<-t[4]/Nind
Q<-t[1]/Nind
H<-(t[2]+t[3])/Nind
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("pop")
p1<-P+H/2
q1<-Q+H/2
alleles = matrix(c(p1, q1), ncol=2, byrow=TRUE)
colnames(alleles) = c("p", "q")
rownames(alleles) <- c("pop")
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
colnames(index) = c("Ho", "He", "F")
rownames(index) <- c("pop")
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
print(h_table)
# A multiple plot
par(mfrow=c(1,3))
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)
}

# Now lets try different allele frequencies and population sizes. What do you observe?

diversity(Alleles,0.5,100)
diversity(Alleles,0.2,10000)
diversity(Alleles,0.1,10000)
diversity(Alleles,0.2,10)

## Real Data: Using our new function to measure genetic diversity in a real case: Hop population in Central Greece

# import data file with 9 SNPs
hop<-read.csv("hop.csv", header=TRUE)

# We create a genotype matrix for analysis for SNP_1
hop1<-hop[,2]
y1<-as.matrix(hop1)
Nind <- length(y1)         # number of individuals in a population

# Allele and genotype frequencies
z<-table(y1)
z
P<-z[1]/Nind
Q<-z[3]/Nind
H<-z[2]/Nind

p1<-P+H/2
q1<-Q+H/2

# Table of genotype frequencies
genotypes = matrix(c(P, H, Q), ncol=3, byrow=TRUE)
colnames(genotypes) = c("P", "H", "Q")
rownames(genotypes) <- c("hop")
genotypes

# Some more metrics
Ho <- unname(H)
He <- 1-((p1^2)+(q1^2))
F <- (He-Ho)/He

# A final table summarizing everything
h_table = matrix(c(Nind, p1, q1, Ho, He, F), ncol=6, byrow=TRUE)
colnames(h_table) = c("N", "p", "q", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table

# A combined plot
par(mfrow=c(1,3))
barplot(genotypes,
main = "Genotype Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="red",
col="blue",
density=10)
barplot(alleles,
main = "Allele Frequencies",
ylab = "Frequency",
ylim = c(0,1),
border="blue",
col="red",
density=10)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

# Can you repeat this analysis for the second SNP?

## Simulated data: Analysis using adegenet

# Let's create a datafile using our function
diversity(Alleles,0.2,10000)

# Call the adegenet library
library(adegenet)

# Create a geneind object
x2<-df2genind(x1, sep="/")
x2
x3<-summary(x2)
x3

# Some metrics
Ho <- mean(x3$Hobs)
He <- mean(x3$Hexp)
F <- (He-Ho)/He
Na <- mean(x2$loc.n.all)

# A final table summarizing everything
h_table = matrix(c(Na, Ho, He, F), ncol=4, byrow=TRUE)
colnames(h_table) = c("Na", "Ho", "He", "F")
rownames(h_table) <- c("pop")
h_table

# A plot comparing Ho and He
index = matrix(c(Ho, He, F), ncol=3, byrow=TRUE)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

### More Loci and more populations: diversity and differentiation
## Real data: Pinus heldreichii in North Greece

# Get the data
info<-read.csv("robolo1.csv")
# number of individuals
nind=nrow(info)
# number of loci, by removing columns of info
nloci=ncol(info)-5

# make genind object 
obj<-df2genind(info[,c(6:19)], sep="/", NA.char = "NA")

# add geographic coordinates and create a first map
obj@other$xy<-info[,4:5]
plot(obj$other$xy,cex=1.5,xlab='x',ylab='y')
points(obj$other$xy, col="red",pch=20)

# add the (sub)population information
pop(obj) <- info[,3]
obj

# Calculate some basic genetic parameters for the whole dataset
sum<-summary(obj)
sum
# Expected and observed hererozygosity and allelic richness
Ho_all <- mean(sum$Hobs)
He_all <- mean(sum$Hexp)
F_all <- (He_all-Ho_all)/He_all
Na_all <- mean(obj$loc.n.all)

# A table of diversity metrics
robolo_table = matrix(c(Na_all, Ho_all, He_all, F_all), ncol=4, byrow=TRUE)
colnames(robolo_table) = c("Na", "Ho", "He", "F")
rownames(robolo_table) <- c("robolo")
robolo_table

# A plot comparing Ho and He
index = matrix(c(Ho_all, He_all, F_all), ncol=3, byrow=TRUE)
fancy<-c("darkred", "darkgreen")
barplot(index[,1:2],
main = "Heterozygosity (observed/expected)",
ylim = c(0,1),
col=fancy,
density=70)

# Create a genpop object
objp <- genind2genpop(obj)
objp
popNames(objp)

# Make a plot showing the number of genotypes per population
barplot(sum$n.by.pop, main="Sample sizes per population",
ylab="Number of genotypes",las=3)

# Genetic distances (Nei) between populations. We will need the package hierfstat
library(hierfstat)
nei <- genet.dist(obj, method = "Dm")
nei

# Constructing a neighbor joining tree using ape
library(ape)
tree <- nj(nei)

### Creating a phylogenetic tree among species using homologous genes
## The cytochrome C oxidase I (COI) gene in hominids

# We will need the packages msa and seqinr
library(seqinr)
library(msa)

# Download the "all.fasta.txt" file
dna<-readDNAStringSet("all.fasta.txt")
dna

# we align the sequences using msa (multiple sequence alignment)
zz <- msa(dna)
zz

# We turn zz into an "align" object to go on finding the genetic distances using the "identitiy" method
align <- msaConvert(zz, type="seqinr::alignment")
d <- dist.alignment(align, "identity")
d

# Create the nj tree. Is it correct?
tree1 <- nj(d)
plot(tree1)

# We need to root the tree using the wild boar as an outgroup. Now the tree is correct!
rtree<-root(tree1, "Sus_scrofa")
plot(rtree)