Changes

Taylor Priest · 65e9179e
--- a/Practical-3-Processing-16S-rRNA-amplicon-data.md
+++ b/Practical-3-Processing-16S-rRNA-amplicon-data.md
@@ -94,86 +94,180 @@ The final step is then to classify the sequences, using the silva database we do
    > taxa <- assignTaxonomy(seqtab.nochim, "silva_nr99_v138_train_set.fa.gz", multithread=8)
-Now lets save our results, so that we can use them again later without having to do all the calculations again:
+Now lets save our results, so that we can use them again later without having to do all the calculations again. Before doing this, let's create a new directory that will contain all of our finished outputs.
-    > saveRDS(seqtab.nochim,"seqtab_final.rds")
+    > dir.create("dada_output") # This will be created in our working directory
-    > saveRDS(taxa, "taxa_final.rds")
-Now that we have these outputs, we need to make them useful! One way to do that is to use the R package phyloseq, which will integrate all kinds of fun things. You can check out the phyloseq/DADA2 interaction on the DADA2 website: https://benjjneb.github.io/dada2/tutorial.html
+Now go ahead and save the files
-For now though, we're just going to use a (relatively) simple plotting function to inspect the sequence diversity in our samples. If you get bored you can try and puzzle through what the plot function is doing and start changing things. But if you're not bothered, you still get some nice plots :) 
+    > saveRDS(seqtab.nochim,"seqtab_final.rds")
+    > saveRDS(taxa, "taxa_final.rds")
-# Start from here on Tuesday
+The files saved above are the standard DADA2 files but we can make them more readable and in a better format to use with other tools also.
-We're all going to start today by making some plots. Sorry if the full dada processing took too long and you didn't get to run it all yourself. If you didn't, all you missed was running some R code and inspecting big unwieldy tables. 
+The following lines of code are simply editing the data files and saving them so that you have the final data in the best possible format for further analysis.
-Start by getting the full processed dada data:
+    ## Generate ASV fasta file
+View(seqtab.nochim)
+asv_seqs <- colnames(seqtab.nochim)
+View(asv_seqs)
+asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
+View(asv_headers)
+for (i in 1:dim(seqtab.nochim)[2]) {asv_headers[i] <- paste(">ASV", i, sep="_")}
+asv_fasta <- c(rbind(asv_headers, asv_seqs))
+View(asv_fasta)
+write(asv_fasta, file="./dada_output/ASV_representatives.fasta")
+    ## Generate ASV count table
+asv_count_tab <- t(seqtab.nochim)
+row.names(asv_count_tab) <- sub(">", "", asv_headers)
+write.table(asv_count_tab, file="./dada_output/ASV_representatives_count_table.tsv",
+            sep="\t", quote=F, col.names=NA)
+    ## Edit the taxa table produced earlier to include ASV name instead of sequence
+row.names(taxa_with_species) <- sub(">", "", asv_headers)
+View(taxa_with_species)
+write.table(taxa_with_species, file="./dada_output/ASV_representatives_taxa_table.tsv",
+            sep="\t", quote=F, col.names=NA)
+We should now have three output files that contain all of the information in a nice and clear format:
+ASV_representatives.fasta = fasta file with our ASV sequences
+ASV_representatives_count_table.tsv = table containing ASVs and their counts across samples
+ASV_representatives_taxa_table.tsv = table containing ASVs and their taxonomy
+**Visualise the data**
+Great job if you have managed to get everything working and the output files produced. However, if you couldn't manage too, then don't worry! We have already got the output files for you.
-    # you don't need this if you created these objects yourself already
    $ cp /bioinf/home/tfrancis/marmic2021/demultiplexed/seqtab_final.rds .
    $ cp /bioinf/home/tfrancis/marmic2021/demultiplexed/taxa_final.rds .
-Then get R with conda: 
+Now we are going to use the produced files in order to make some plots and visualise the data. To do this, we are going to use the phyloseq package. Phyloseq is a very powerful tool for ecological analysis. It allows you to combine together lots of information into a single object, e.g. count data, taxa data, sample metadata, phylogenetic trees etc.
-    $ conda install -c r r
-    $ conda install -c r r-essentials
-    # then start R
+First, let's load the necessary libraries:
-    $ R
-load the data:
+    library(phyloseq)
-    > seqtab.nochim <- readRDS("seqtab_final.rds")
-    > taxa <- readRDS("taxa_final.rds")
-then do the plotting :)
-```
    library(ggplot2)
    library(reshape2)
    library(RColorBrewer)
+    library(vegan)
+    library(plyr)
+Before we create our phyloseq object, we can import some metadata about our samples, e.g. Date, location etc.
+    herschel_metadata <- read.table("../herschel_sample_meta.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
+    rownames(herschel_metadata) <- herschel_metadata$Sample
+    View(herschel_metadata)
+Now let's create our phyloseq object
+    physeq_herschel <- phyloseq(otu_table(asv_count_tab, taxa_are_rows=TRUE), 
+                        tax_table(taxa_with_species), sample_data(herschel_metadata))
+Once you have created your phyloseq object, the datasets can be accessed by using the following syntax
+    otu_table(physeq_herschel)
+    tax_table(physeq_herschel)
+    sample_data(physeq_herschel)
+Make some basic diversity plots and rarefaction curves
+    ## Diversity plots
+    alpha_plot_herschel <- plot_richness(physeq_herschel, measures=c("Chao1", "Shannon", "Simpson"), x="sample", nrow=3)
+    alpha_plot_herschel
+    ## Rarefaction curves to provide indication of the coverage of our community
+    rarefaction_herschel <- rarecurve(t(otu_table(physeq_herschel)), step=50, cex=0.5)
+    rarefaction_herschel
+Now let's visualise the composition of our sample communities using basic barplots. For this, we first need to calculate relative abundance
+    physeq_herschel_rel <- transform_sample_counts(physeq_herschel, function(otu) otu/sum(out))
+    physeq_herschel_rel
+    herschel_phylum_barplot <- plot_bar(physeq_herschel_rel, fill="Phylum") +
+facet_grid(~Transect, scales="free_x") + 
+  #scale_fill_manual(values=getPalette) + 
+  guides(fill=guide_legend(ncol=1)) + 
+  labs(y="Relative abundance", x="Sample") + 
+  theme_bw() + 
+  theme(axis.title.x = element_text(size=12,face="bold",colour="black"), 
+        axis.title.y = element_text(size=12,face="bold",colour="black"),
+        axis.text.x = element_text(size=10, colour="black", angle=45, hjust=1), 
+        legend.title = element_text(size=12,face="bold",colour="black"),
+        strip.background.x = element_rect(fill="white"), 
+        strip.text.x = element_text(size=11,face="bold",colour="black"))
+herschel_phylum_barplot
+## Now at the Family level
+herschel_family_barplot <- plot_bar(physeq_herschel_rel, fill="Family") + 
+  facet_grid(~Transect, scales="free_x") + 
+  #scale_fill_manual(values=getPalette) + 
+  guides(fill=guide_legend(ncol=1)) + 
+  labs(y="Relative abundance", x="Sample") + 
+  theme_bw() + 
+  theme(axis.title.x = element_text(size=12,face="bold",colour="black"), 
+        axis.title.y = element_text(size=12,face="bold",colour="black"),
+        axis.text.x = element_text(size=10, colour="black", angle=45, hjust=1), 
+        legend.title = element_text(size=12,face="bold",colour="black"), 
+        strip.background.x = element_rect(fill="white"), 
+        strip.text.x = element_text(size=11,face="bold",colour="black"))
+herschel_family_barplot
+## WAAAAYYY TOO MANY FAMILIES
+## What should we do?
+## This is common with microbial datasets and the best solution is to apply a 
+## threshold, to only show those that are at least x% in abundance
+## Before we apply a threshold though, it is good to visualise the occurence
+## of ASVs before determining a justifiable threshold
+filtered = genefilter_sample(physeq_herschel_rel, filterfun_sample(function(x) x > 0.01), A=0.05*nsamples(physeq_herschel_rel))
+filtered
+filtered_taxa = prune_taxa(filtered, physeq_herschel_rel)
+filtered_taxa
+## Let's repeat the family level barplot with this filtered dataset
+herschel_filtered_family_barplot <- plot_bar(filtered_taxa, fill="Family") + 
+  facet_grid(~Transect, scales="free_x") + 
+  #scale_fill_manual(values=getPalette) + 
+  guides(fill=guide_legend(ncol=1)) + 
+  labs(y="Relative abundance", x="Sample") + 
+  theme_bw() + 
+  theme(axis.title.x = element_text(size=12,face="bold",colour="black"), 
+        axis.title.y = element_text(size=12,face="bold",colour="black"),
+        axis.text.x = element_text(size=10, colour="black", angle=45, hjust=1), 
+        legend.title = element_text(size=12,face="bold",colour="black"), 
+        strip.background.x = element_rect(fill="white"), 
+        strip.text.x = element_text(size=11,face="bold",colour="black"))
+herschel_filtered_family_barplot
+## Hard to distinguish these colours, so let's try changing the colour scheme
+## First we extract the data from the plot above
+## Then we will assign colours to families and replot
+## This time for the plotting we will use ggplot itself (which is what phyloseq was using above)
+herschel_filtered_family_barplot_data <- herschel_filtered_family_barplot$data
+View(herschel_filtered_family_barplot_data)
+herschel_filtered_family_barplot_data[is.na(herschel_filtered_family_barplot_data)] <- "Unknown"
+colourCount=length(unique(herschel_filtered_family_barplot_data$Family))
+herschel_filtered_family_barplot_refined <- ggplot(herschel_filtered_family_barplot_data, 
+                                                   aes(x=Sample, y=Abundance)) +
+  geom_bar(aes(fill=Family), stat="identity", position="stack") + 
+  facet_grid(~Transect, scales="free_x") + 
+  #scale_fill_manual(values=getPalette) + 
+  guides(fill=guide_legend(ncol=1)) + 
+  labs(y="Relative abundance", x="Sample") + 
+  scale_fill_manual(values=colorRampPalette(brewer.pal(12,"Paired"))(colourCount)) + 
+  theme_bw() + 
+  theme(axis.title.x = element_text(size=12,face="bold",colour="black"), 
+        axis.title.y = element_text(size=12,face="bold",colour="black"),
+        axis.text.x = element_text(size=10, colour="black", angle=45, hjust=1), 
+        legend.title = element_text(size=12,face="bold",colour="black"), 
+        strip.background.x = element_rect(fill="white"), 
+        strip.text.x = element_text(size=11,face="bold",colour="black"))
+herschel_filtered_family_barplot_refined
-plotTaxon <- function(seqtab, taxtable, taxon, min_abund, taxonomicLevel) {
-  seqtab <- data.frame(seqtab)
-  colnames(seqtab) <- paste(taxtable[, 1], taxtable[, 2], taxtable[, 3], taxtable[, 4], taxtable[, 5], taxtable[, 6], as.character(1:length(taxtable[, 1])), sep="-")
-  seqtab <- seqtab[, grep("Chloroplast|Mitochondria", colnames(seqtab), invert=TRUE)]
-  seqtab <- seqtab/rowSums(seqtab)
-  seqtab1 <- seqtab[, sapply(seqtab, function(x) max(x)) >= min_abund]
-  seqtab2 <- seqtab[, sapply(seqtab, function(x) max(x)) < min_abund]
-  colnames(seqtab2) <- paste0(colnames(seqtab2), "_Others")
-  seqtab3 <- merge(seqtab1, seqtab2, by="row.names")
-  row.names(seqtab3) <- row.names(seqtab)
-  seqtab <- seqtab3
-  seqtab$sample <- rownames(seqtab)
-  seqtab.m <- melt(seqtab)
-  seqtab.m <- seqtab.m[grep(taxon, seqtab.m$variable),]
-  seqtab.m$variable <- gsub("NA", "uncultured", seqtab.m$variable)
-  seqtab.m$variable <- gsub(".*Others", "Below_Min_Abund-Below_Min_Abund-Below_Min_Abund-Below_Min_Abund-Below_Min_Abund-Below_Min_Abund-Below_Min_Abund", seqtab.m$variable)
-  seqtab.m$variable <- as.character(seqtab.m$variable
-  seqtab.m$Domain <- sapply(strsplit(seqtab.m$variable, "-"), "[[", 1)
-  seqtab.m$Phylum <- sapply(strsplit(seqtab.m$variable, "-"), "[[", 2)
-  seqtab.m$Class <- sapply(strsplit(seqtab.m$variable, "-"), "[[", 3)
-  seqtab.m$Order <- sapply(strsplit(seqtab.m$variable, "-"), "[[", 4)
-  seqtab.m$Family <- sapply(strsplit(seqtab.m$variable, "-"), "[[", 5)
-  seqtab.m$Genus <- sapply(strsplit(seqtab.m$variable, "-"), "[[", 6)
-  seqtab.m <- cbind(seqtab.m, replicate(1,seqtab.m[colnames(seqtab.m) == taxonomicLevel]))
-  colnames(seqtab.m) <- c(colnames(seqtab.m)[1:length(colnames(seqtab.m)) -1], "displayLevel")
-  nb.cols <- length(seqtab) - 1
-  mycolours <- rep(brewer.pal(12, "Paired"), ceiling(nb.cols/12))
-  ggplot(seqtab.m) +
-    geom_bar(aes(x=sample, y=value, fill=displayLevel), stat="identity", position="stack") +
-    scale_fill_manual(values=mycolours) +
-    labs(x="Sample", y="Proportion of Reads", fill=taxonomicLevel) +
-    theme_classic() +
-    theme(text=element_text(family="Serif", size=16)) +
-    theme(axis.text.x=element_text(angle=45, hjust=1))
-}
-```
-So the bit above creates the function called `plotTaxon()`, which we can then use to generate the plots. 
-    > plotTaxon(seqtab.nochim, taxa, "Bacteroidota", 0.01, "Genus")
-Happy plotting!
-(Hint - try Desulfobacterota (with reduced minimum abundance) or Campilobacterota, or try changing the taxonomic level you view, to see what inferences you can make about your samples)
-If you want to look at subsets of the data, which would be a good idea, try selecting just a handful of rows of interest from the `seqtab.nochim` object and feeding that into the plotting function. R has a `grep()` function, that you'll need to look up. The sample names you'll want to search in are in `row.names(seqtab.nochim)`. Subsetting an R object is done by using  square brackets `[]` after the name of the object you want to subset.
\ No newline at end of file