# Import the microbiome study data object (Tree Summarized Experiment)
tse <- readRDS("O'Ferrall_snails_2024.rds")
## Attach packages
library(knitr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(mia)
## Loading required package: MultiAssayExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Loading required package: SingleCellExperiment
## Loading required package: TreeSummarizedExperiment
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## This is mia version 1.14.0
## - Online documentation and vignettes: https://microbiome.github.io/mia/
## - Online book 'Orchestrating Microbiome Analysis (OMA)': https://microbiome.github.io/OMA/docs/devel/
##
## Attaching package: 'mia'
## The following objects are masked from 'package:dplyr':
##
## full_join, inner_join, left_join, right_join
library(miaViz)
## Loading required package: ggraph
##
## Attaching package: 'miaViz'
## The following object is masked from 'package:mia':
##
## plotNMDS
library(pheatmap)
library(ggrepel)
library(patchwork)
library(scater)
## Loading required package: scuttle
Taxonomy and recovered metagenome assembled genomes (MAGs)
# Calculate the number of unique taxa for each taxonomic rank
apply(rowData(tse), 2, function (x) {length(unique(x))})
## domain phylum class order family genus species
## 1 39 73 160 352 1246 4773
## clade_name
## 4773
# Print total number of high-quality MAGs
print(sum(colData(tse)$high_qual_MAG))
## [1] 8
# Print total number of medium-quality MAGs
print(sum(colData(tse)$mid_qual_MAG))
## [1] 21
Phyla
# Make a relative abundance matrix for phyla
relabundance_matrix_phylum <- assay(altExp(tse, "phylum"), "relabundance")
# Calculate the total relative abundance of each phylum (row sums)
total_relabundance_phylum <- rowSums(relabundance_matrix_phylum)
# Identify the top 7 phyla (NB. these have mean relative abundance > 1%)
top_phyla <- names(sort(total_relabundance_phylum, decreasing = TRUE)[1:7])
# Make a new TSE where the phyla with relative abundance > 1% are recognized, while others are "-Other"
tse_top_7_phyla <- altExp(tse, "phylum")
rowData(tse_top_7_phyla)$phylum <- ifelse(rowData(tse_top_7_phyla)$phylum %in% top_phyla, rowData(tse_top_7_phyla)$phylum, "-Other")
rowData(tse_top_7_phyla)$Rank <- gsub("^p__", "", rowData(tse_top_7_phyla)$phylum)
# Define custom colors
custom_phyla_colors <- c(
"Actinobacteria" = "#F8766D",
"Bacteroidetes" = "#7CAE00",
"Cyanobacteria" = "#00BFC4",
"Deinococcus-Thermus" = "#C77CFF",
"Firmicutes" = "#A0522D",
"Planctomycetes" = "#FF69B4",
"Proteobacteria" = "#1F4E79",
"-Other" = "#BEBEBE"
)
# Plot the abundance
phyla_plot <- plotAbundance(tse_top_7_phyla, assay.type = "relabundance", rank = "Rank", add_x_text = TRUE) +
scale_fill_manual(values = custom_phyla_colors, name = "Phylum") +
scale_colour_manual(values = custom_phyla_colors, name = "Phylum") +
guides(fill = guide_legend(override.aes = list(color = NA))) +
theme(plot.margin = margin(t = 30, r = 10, b = 10, l = 10))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Save the plot
ggsave("phyla_plot.png", plot = phyla_plot, width = 7, height = 5, dpi = 600)
# Print the plot
print(phyla_plot)

Calculate Proteobacteria mean relative abundance
# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_phylum <- relabundance_matrix_phylum * 100
# Extract Proteobacteria
proteobacteria_abundance <- relabundance_matrix_phylum["p__Proteobacteria", , drop = FALSE]
# Calculate mean Proteobacteria relative abundance
mean_proteobacteria_relabund <- sum(proteobacteria_abundance) / 15
print(mean_proteobacteria_relabund)
## [1] 66.27109
Orders
# Make a relative abundance matrix for order
relabundance_matrix_order <- assay(altExp(tse, "order"), "relabundance")
# Calculate the total relative abundance of each order (row sums)
total_relabundance_order <- rowSums(relabundance_matrix_order)
# Identify the top 20 orders (NB. these have mean relative abundance > 1%)
top_orders <- names(sort(total_relabundance_order, decreasing = TRUE)[1:20])
# Make a new TSE where the orders with relative abundance > 1% are recognized, while others are "-Other"
tse_top_20_orders <- altExp(tse, "order")
rowData(tse_top_20_orders)$order <- ifelse(rowData(tse_top_20_orders)$order %in% top_orders, rowData(tse_top_20_orders)$order, "-Other")
rowData(tse_top_20_orders)$Rank <- gsub("^o__", "", rowData(tse_top_20_orders)$order)
custom_order_colors <- c(
"-Other" = "#BEBEBE",
"Flavobacteriales" = "yellow",
"Deinococcales" = "#377EB8",
"Nostocales" = "#4DAF4A",
"Bacillales" = "#984EA3",
"Micromonosporales" = "#FF7F00",
"Propionibacteriales" = "grey20",
"Streptomycetales" = "#A65628",
"Corynebacteriales" = "#00441B",
"Micrococcales" = "blue",
"Xanthomonadales" = "#66C2A5",
"Enterobacterales" = "#FC8D62",
"Pseudomonadales" = "#8DA0CB",
"Aeromonadales" = "#D73027",
"Caulobacterales" = "#A6D854",
"Rhodospirillales" = "#E6AB02",
"Sphingomonadales" = "#E5C494",
"Rhodobacterales" = "#54278F",
"Rhizobiales" = "#D95F02",
"Rhodocyclales" = "#7570B3",
"Burkholderiales" = "#1B9E77"
)
# Plot the abundance
order_plot <- plotAbundance(tse_top_20_orders, assay.type = "relabundance", rank = "Rank", add_x_text = TRUE) +
scale_fill_manual(values = custom_order_colors, name = "Order") +
scale_colour_manual(values = custom_order_colors, name = "Order") +
guides(fill = guide_legend(override.aes = list(color = NA)))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Save the plot
ggsave("order_plot.png", plot = order_plot, width = 8.2, height = 5, dpi = 600)
# Print the plot
print(order_plot)

Calculate Enterobacterales mean relative abundance
# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_order <- relabundance_matrix_order * 100
# Extract Enterobacterales
enterobacterales_abundance <- relabundance_matrix_order["o__Enterobacterales", , drop = FALSE]
# Calculate mean Enterobacterales relative abundance
mean_enterobacterales_relabund <- sum(enterobacterales_abundance) / 15
print(mean_enterobacterales_relabund)
## [1] 4.785705
Calculate Enterobacterales mean relative abundance with outlier
(Uk1) removed
# Remove Uk1 from the Enterobacterales relative abundance dataframe
enterobacterales_abundance_minus_Uk1 <- as.data.frame(enterobacterales_abundance)
enterobacterales_abundance_minus_Uk1$Uk1 <- NULL
# Calculate mean Enterobacterales relative abundance without Uk1
mean_enterobacterales_relabund_minus_Uk1 <- sum(enterobacterales_abundance_minus_Uk1) / 14
print(mean_enterobacterales_relabund_minus_Uk1)
## [1] 2.14886
Exploring relative abundance of Shewanella (genus level)
# Make a relative abundance matrix for genus
relabundance_matrix_genus <- assay(altExp(tse, "genus"), "relabundance")
# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_genus <- relabundance_matrix_genus * 100
# Extract Shewanella
shewanella_abundance <- relabundance_matrix_genus["g__Shewanella", , drop = FALSE]
# Print Shewanella relabundance
print(shewanella_abundance)
## M1 M2 M3 M4 M5 M6
## g__Shewanella 0.07431693 0.07474352 0.09481099 0.2391963 5.065212 0.06392155
## Ug1 Ug2 Ug3 Z1 Z2 Z3
## g__Shewanella 0.09437259 0.2747101 0.07968018 0.05922727 0.08122704 0.1393015
## Z4 Uk1 Uk2
## g__Shewanella 0.08427348 0.09382548 1.328371
# Calculate mean Shewanella relative abundance
mean_shewanella_relabundance <- sum(shewanella_abundance) / 15
print(mean_shewanella_relabundance)
## [1] 0.523146
# Remove M5 and Uk2 from the Shewanella relative abundance dataframe
shewanella_abundance_minus_M5_Uk2 <- as.data.frame(shewanella_abundance)
shewanella_abundance_minus_M5_Uk2$M5 <- NULL
shewanella_abundance_minus_M5_Uk2$Uk2 <- NULL
# Calculate mean Shewanella relative abundance without M5 and Uk2 from which OXA-48-like genes were later identified
mean_shewanella_relabundance_minus_M5_Uk2 <- sum(shewanella_abundance_minus_M5_Uk2) / 13
print(mean_shewanella_relabundance_minus_M5_Uk2)
## [1] 0.1118159
Exploring relative abundance of Aeromonas (genus level)
# Extract Aeromonas
aeromonas_abundance <- relabundance_matrix_genus["g__Aeromonas", , drop = FALSE]
# Print Aeromonas relabundance
print(aeromonas_abundance)
## M1 M2 M3 M4 M5 M6 Ug1
## g__Aeromonas 6.476615 1.611563 6.747668 12.41048 53.81697 1.920293 3.322136
## Ug2 Ug3 Z1 Z2 Z3 Z4 Uk1
## g__Aeromonas 6.234634 29.57037 4.387261 3.711667 5.622518 4.802656 1.444432
## Uk2
## g__Aeromonas 4.5081
# Calculate mean Aeromonas relative abundance
mean_aeromonas_relabundance <- sum(aeromonas_abundance) / 15
print(mean_aeromonas_relabundance)
## [1] 9.772491
Hierarchical clustering of samples and taxa
# Obtain read count data for the top 30 Orders
tse_order <- altExp(tse, "order")
# Add clr-transformation
tse_order <- transformAssay(tse_order, assay.type = "counts", method = "relabundance")
tse_order <- transformAssay(tse_order, assay.type = "relabundance",
method = "clr", pseudocount = TRUE)
## A pseudocount of 4.57444733324424e-07 was applied.
# Apply a z-transformation across rows (taxa)
tse_order <- transformAssay(tse_order, assay.type = "clr",
MARGIN = "features",
method = "standardize", name = "clr_z")
# Get 50 most abundant orders, and subsets the data by them
top_50_orders <- names(sort(total_relabundance_order, decreasing = TRUE)[1:50])
tse_order_subset <- tse_order[top_50_orders, ]
# Calculate the proportion of bacteria accounted for in the top 50 Orders
(sum(assay(tse_order_subset, "relabundance"))) / 15
## [1] 0.9792133
# Gets the assay table
mat <- assay(tse_order_subset, "clr_z")
# Make a dataframe for sample information
sample_data <- as.data.frame(colData(tse_order_subset))
sample_data <- sample_data[, c("Continent", "Administrative_region", "snail_genus"), drop = FALSE]
names(sample_data)[names(sample_data) == "snail_genus"] <- "Snail genus"
names(sample_data)[names(sample_data) == "Administrative_region"] <- "Administrative region"
# Make a dataframe for taxonomic information
taxonomic_data <- as.data.frame(rowData(tse_order_subset))
taxonomic_data <- taxonomic_data[, "phylum", drop = FALSE]
colnames(taxonomic_data)[colnames(taxonomic_data) == "phylum"] <- "Phylum"
taxonomic_data$Phylum <- sub("^p__", "", taxonomic_data$Phylum)
# Set heatmap scaling and colors
breaks <- seq(-ceiling(max(abs(mat))), ceiling(max(abs(mat))),
length.out = ifelse( max(abs(mat))>5, 2*ceiling(max(abs(mat))), 10 ) )
colors <- colorRampPalette(c("darkblue", "blue", "white", "red", "darkred"))(length(breaks)-1)
# Set column annotation colors
ann_colors <- list(
"Administrative region" = c(
"Malawi" = "forestgreen",
"Uganda" = "yellowgreen",
"United Kingdom" = "greenyellow",
"Zanzibar" = "yellow2"
),
Phylum = c(
"Actinobacteria" = "#1B9E77",
"Bacteroidetes" = "#66C2A5",
"Cyanobacteria" = "#3288BD",
"Deinococcus-Thermus" = "#5E4FA2",
"Firmicutes" = "#ABDDA4",
"Planctomycetes" = "#A6CEE3",
"Proteobacteria" = "#8DA0CB",
"Tenericutes" = "#E0F3F8",
"Verrucomicrobia" = "#00441B"
),
"Snail genus" = c(
"Anisus" = "darkorange2",
"Biomphalaria" = "orange",
"Bulinus" = "red",
"Cleopatra" = "red3",
"Lanistes" = "tan4",
"Lymnaea" = "bisque3",
"Physella" = "bisque",
"Pila" = "goldenrod3"
),
Continent = c(
"Africa" = "grey90",
"Europe" = "grey35"
)
)
# Create the heatmap
order_clusters <- pheatmap(mat, annotation_col = sample_data, annotation_row = taxonomic_data, annotation_colors = ann_colors, color = colors, breaks = breaks, legend_width = 0.5)
# Save the plot
ggsave("order_clusters.png", plot = order_clusters, width = 10, height = 8, dpi = 600)
# Print plot
print(order_clusters)

Diversity
Alpha diversity
# Calculate diversity statistics
tse <- addAlpha(tse, index = "shannon")
tse <- addAlpha(tse, index = "gini_simpson")
# Print Shannon's diversity
print(colData(tse)$shannon)
## [1] 7.039099 6.962225 7.282968 6.887802 3.881060 5.815881 6.333669 6.445851
## [9] 6.051164 6.954845 7.372464 7.245500 7.300361 4.835209 6.701276
# Print mean Shannon's diversity
print(mean(colData(tse)$shannon))
## [1] 6.473958
# Print Shannon's diversity median
print(median(colData(tse)$shannon))
## [1] 6.887802
# Print Simpson's diversity
print(colData(tse)$gini_simpson)
## [1] 0.9964133 0.9951006 0.9969621 0.9914432 0.7925213 0.9738414 0.9864923
## [8] 0.9911865 0.9715998 0.9939893 0.9986682 0.9978338 0.9972464 0.8985658
## [15] 0.9941253
# Print mean Simpson's diversity
print(mean(colData(tse)$gini_simpson))
## [1] 0.9717326
# Print Simpson's diversity median
print(median(colData(tse)$gini_simpson))
## [1] 0.9939893
Proteobacteria vs alpha-diversity
# Add Proteobacteria relative abundance to colData
tse_proteobacteria <- tse[rowData(tse)$phylum %in% c("p__Proteobacteria") & !is.na(rowData(tse)$phylum), ]
proteobacteria_relabund <- colSums(assay(tse_proteobacteria, "relabundance"))
colData(tse)$proteobacteria_relabund <- proteobacteria_relabund
# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$shannon
# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
# Print correlation coefficient and its associated p-value
print(cor_test)
##
## Spearman's rank correlation rho
##
## data: variable1 and variable2
## S = 916, p-value = 0.01287
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.6357143
Beta diversity
# Generate beta-diversity statistics (Bray-Curtis distance)
tse <- addDissimilarity(tse, assay.type = "relabundance" , method = "bray")
bray_data <- metadata(tse)[["bray"]]
# Print values
print(bray_data)
## M1 M2 M3 M4 M5 M6 Ug1
## M1 0.0000000 0.3950959 0.2578856 0.3099759 0.7660522 0.5444646 0.4956978
## M2 0.3950959 0.0000000 0.4055159 0.4547122 0.7835672 0.5872936 0.4744968
## M3 0.2578856 0.4055159 0.0000000 0.2293442 0.7544281 0.5129921 0.5095420
## M4 0.3099759 0.4547122 0.2293442 0.0000000 0.7060798 0.5080983 0.5440057
## M5 0.7660522 0.7835672 0.7544281 0.7060798 0.0000000 0.8186170 0.7946426
## M6 0.5444646 0.5872936 0.5129921 0.5080983 0.8186170 0.0000000 0.6178323
## Ug1 0.4956978 0.4744968 0.5095420 0.5440057 0.7946426 0.6178323 0.0000000
## Ug2 0.3837108 0.4853790 0.4557216 0.4802522 0.7355626 0.5711390 0.4613960
## Ug3 0.3780083 0.4766061 0.3650904 0.3433664 0.6036607 0.5409692 0.5308463
## Z1 0.4998875 0.5227628 0.4218702 0.4748654 0.7826303 0.6387735 0.5707443
## Z2 0.4271759 0.4772067 0.3131357 0.3968051 0.8023832 0.6310114 0.5866067
## Z3 0.3362135 0.4131174 0.2451802 0.3423994 0.7357711 0.5558053 0.5162808
## Z4 0.3727922 0.3795538 0.2517945 0.3499632 0.7418444 0.5907308 0.5478100
## Uk1 0.7194104 0.7556882 0.7674740 0.7746296 0.8438423 0.7702989 0.7174539
## Uk2 0.4059672 0.4191871 0.4452232 0.4895973 0.7560532 0.5714587 0.5117115
## Ug2 Ug3 Z1 Z2 Z3 Z4 Uk1
## M1 0.3837108 0.3780083 0.4998875 0.4271759 0.3362135 0.3727922 0.7194104
## M2 0.4853790 0.4766061 0.5227628 0.4772067 0.4131174 0.3795538 0.7556882
## M3 0.4557216 0.3650904 0.4218702 0.3131357 0.2451802 0.2517945 0.7674740
## M4 0.4802522 0.3433664 0.4748654 0.3968051 0.3423994 0.3499632 0.7746296
## M5 0.7355626 0.6036607 0.7826303 0.8023832 0.7357711 0.7418444 0.8438423
## M6 0.5711390 0.5409692 0.6387735 0.6310114 0.5558053 0.5907308 0.7702989
## Ug1 0.4613960 0.5308463 0.5707443 0.5866067 0.5162808 0.5478100 0.7174539
## Ug2 0.0000000 0.4380097 0.5745889 0.5747490 0.4334179 0.5110393 0.7113661
## Ug3 0.4380097 0.0000000 0.5670455 0.5193736 0.4169732 0.4651173 0.7617499
## Z1 0.5745889 0.5670455 0.0000000 0.3211763 0.3837895 0.4210765 0.8087910
## Z2 0.5747490 0.5193736 0.3211763 0.0000000 0.2714854 0.2517693 0.8068478
## Z3 0.4334179 0.4169732 0.3837895 0.2714854 0.0000000 0.2320110 0.7428368
## Z4 0.5110393 0.4651173 0.4210765 0.2517693 0.2320110 0.0000000 0.7765925
## Uk1 0.7113661 0.7617499 0.8087910 0.8068478 0.7428368 0.7765925 0.0000000
## Uk2 0.4193660 0.4487173 0.5983182 0.5381829 0.4245999 0.4586818 0.7089693
## Uk2
## M1 0.4059672
## M2 0.4191871
## M3 0.4452232
## M4 0.4895973
## M5 0.7560532
## M6 0.5714587
## Ug1 0.5117115
## Ug2 0.4193660
## Ug3 0.4487173
## Z1 0.5983182
## Z2 0.5381829
## Z3 0.4245999
## Z4 0.4586818
## Uk1 0.7089693
## Uk2 0.0000000
# Define custom colors for Snail genus
snail_genus_colors <- list(
"Snail genus" = c(
"Anisus" = "darkorange2",
"Biomphalaria" = "orange",
"Bulinus" = "red",
"Cleopatra" = "red3",
"Lanistes" = "tan4",
"Lymnaea" = "bisque3",
"Physella" = "bisque",
"Pila" = "goldenrod3"
)
)
# Make a dataset for additional ARG information
annotation <- as.data.frame(colData(tse))
annotation <- annotation[, "snail_genus", drop = FALSE]
names(annotation)[names(annotation) == "snail_genus"] <- "Snail genus"
# Heatmap
bray_curtis_plot <- pheatmap(bray_data, cluster_rows = FALSE, cluster_cols = FALSE,
annotation_col = annotation, annotation_colors = snail_genus_colors,
main = "Bray-Curtis distance"
)
# Save the plot
ggsave("bray_curtis_plot.png", bray_curtis_plot, height = 5, width = 8)
# Print the plot
print(bray_curtis_plot)

# Perform multidimensional scaling (MDS) using Bray-Curtis dissimilarity on relative abundance data
tse <- runMDS(tse, FUN = getDissimilarity, method = "bray", assay.type = "relabundance", name = "MDS_bray")
# Extract coordinates
mds_coords <- reducedDims(tse)$MDS_bray
# Create data frame for plotting
pcoa_df <- data.frame(
Axis1 = mds_coords[,1],
Axis2 = mds_coords[,2],
SampleName = rownames(mds_coords)
)
# Add administrative region to the data frame for plotting
pcoa_df$Administrative_region <- colData(tse)$Administrative_region
# Plot PCoA with sample names
pcoa_plot <- ggplot(pcoa_df, aes(x = Axis1, y = Axis2, color = Administrative_region, label = SampleName)) +
geom_point(size = 2) +
geom_text_repel(size = 2.5, max.overlaps = 20) +
theme_minimal() +
labs(
x = "PCoA Axis 1",
y = "PCoA Axis 2",
color = "Administrative region"
)
# Save the plot
ggsave("pcoa_plot.png", pcoa_plot, width = 7, height = 5)
# Print the plot
print(pcoa_plot)

ARGs
ARG load
# Calculate ARG load
colData(tse)$ARG_load <- colSums(assay(altExp(tse, "read"), "abundances"))
# Print ARG load
print(colData(tse)$ARG_load)
## M1 M2 M3 M4 M5 M6 Ug1
## 54.029774 59.618260 37.058997 65.323354 357.039501 10.657301 75.585206
## Ug2 Ug3 Z1 Z2 Z3 Z4 Uk1
## 95.780916 329.591742 47.627164 3.161198 27.483552 16.687200 80.439274
## Uk2
## 59.692572
# Calculate mean ARG load
print(mean(colData(tse)$ARG_load))
## [1] 87.98507
# Calculate median ARG load
print(median(colData(tse)$ARG_load))
## [1] 59.61826
ARG richness (number of ARG references that reads mapped to per
sample)
# Calculate ARG richness
colData(tse)$ARG_richness <- colSums(assay(altExp(tse, "read"), "pa"))
# Print ARG richness
print(colData(tse)$ARG_richness)
## M1 M2 M3 M4 M5 M6 Ug1 Ug2 Ug3 Z1 Z2 Z3 Z4 Uk1 Uk2
## 4 10 3 3 11 2 4 16 12 13 1 10 6 6 5
# Calculate mean ARG richness
print(mean(colData(tse)$ARG_richness))
## [1] 7.066667
# Calculate median ARG richness
print(median(colData(tse)$ARG_richness))
## [1] 6
Assembly-based ARG detection
# Print ARG detected by assembly-based methods
print(colData(tse)$ARGs_assembly)
## [1] 2 3 0 1 2 1 5 6 5 2 0 1 2 6 2
# Print sum of ARGs detected by assembly-based methods
print(sum((colData(tse)$ARGs_assembly)))
## [1] 38
# Calculate mean: ARGs in assembly-based detection
print(mean(colData(tse)$ARGs_assembly))
## [1] 2.533333
# Calculate variance: ARGs in assembly-based detection
print(var(colData(tse)$ARGs_assembly))
## [1] 4.12381
# Calculate median: ARGs in assembly-based detection
print(median(colData(tse)$ARGs_assembly))
## [1] 2
# Plot assembly vs read ARG detection
read_vs_assembly <- ggplot(mapping = aes(ARGs_assembly, ARG_richness), data = colData(tse)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(x = "ARGs detected by assembly", y = "ARGs detected by read mapping") +
theme_bw() +
scale_x_continuous(limits = c(-1, 17), expand = c(0, 0)) +
scale_y_continuous(limits = c(-1, 17), expand = c(0, 0)) +
coord_fixed()
# Save the plot
ggsave("read_vs_assembly.png", plot = read_vs_assembly)
## Saving 7 x 5 in image
# Print the plot
print(read_vs_assembly)

How many MAGs (and MAGs assigned to Proteobacteria) contain
ARGs?
# Print total number of ARGs in a MAG
print(sum((colData(tse)$ARGs_MAG)))
## [1] 10
# Print total number of ARGs in a Proteobacteria MAG
print(sum((colData(tse)$ARGs_Proteobacteria_MAG)))
## [1] 10
Proteobacteria vs ARG load
# Plot annotated Proteobacteria relative abundance ARG load on log scale to improve visualisation
proteo_ARG_load_annotated <- ggplot(mapping = aes(proteobacteria_relabund, ARG_load, color = colData(tse)$"Administrative_region"), data = colData(tse)) +
geom_point() +
geom_text_repel(aes(label = colData(tse)$snail_genus), fontface = "italic", color = "black") +
labs(x = "Rel. abundance of Proteobacteria", y = "ARG load",
color = "Administrative region") +
theme_bw() +
scale_x_continuous(limits = c(0, 1)) +
scale_fill_viridis_c(option = "magma")
# Save the plot
ggsave("proteo_ARG_load_annotated.png", plot = proteo_ARG_load_annotated, width = 7, height = 5)
# Print the plot
print(proteo_ARG_load_annotated)

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$ARG_load
# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
# Print correlation coefficient and its associated p-value
print(cor_test)
##
## Spearman's rank correlation rho
##
## data: variable1 and variable2
## S = 86, p-value = 6.005e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8464286
Proteobacteria vs ARG richness
# Plot Proteobacteria relative abundance vs ARG richness
proteo_ARG_richness <- ggplot(mapping = aes(proteobacteria_relabund, ARG_richness, color = colData(tse)$"Administrative_region"), data = colData(tse)) +
geom_point() +
geom_text_repel(aes(label = colData(tse)$snail_genus), fontface = "italic", color = "black") +
labs(x = "Rel. abundance of Proteobacteria", y = "ARG richness",
color = "Administrative region") +
theme_bw() +
scale_x_continuous(limits = c(0, 1)) +
scale_fill_viridis_c(option = "magma")
# Save the plot
ggsave("proteo_ARG_richness.png", plot = proteo_ARG_richness, width = 5, height = 5)
# Print the plot
print(proteo_ARG_richness)

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$ARG_richness
# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties
# Print correlation coefficient and its associated p-value
print(cor_test)
##
## Spearman's rank correlation rho
##
## data: variable1 and variable2
## S = 395.41, p-value = 0.2877
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2939087
Contributors to ARG load within the Proteobacteria
# Make a relative abundance matrix for order
relabundance_matrix_Proteobacteria_order <- assay(altExp(tse_proteobacteria, "order"), "relabundance")
# calculate the total relative abundance of each order (row sums)
total_relabundance_Proteobacteria_order <- rowSums(relabundance_matrix_Proteobacteria_order)
# Identify the top 5 Orders within Proteobacteria
top_Proteobacteria_orders <- names(sort(total_relabundance_Proteobacteria_order, decreasing = TRUE)[1:5])
# Print the top 5 Orders within Proteobacteria
print(top_Proteobacteria_orders)
## [1] "o__Burkholderiales" "o__Rhizobiales" "o__Aeromonadales"
## [4] "o__Pseudomonadales" "o__Enterobacterales"
# Plot relative abundance vs ARG load for the top 5 Orders within Proteobacteria
# Transpose the matrix so that samples are rows and orders are columns
transposed_relabundance <- t(relabundance_matrix_Proteobacteria_order)
# Ensure row names in transposed matrix match colData(tse)
rownames(transposed_relabundance) <- colnames(relabundance_matrix_Proteobacteria_order)
# Keep only the top 5 Proteobacteria orders
top_5_relabund <- transposed_relabundance[, top_Proteobacteria_orders, drop = FALSE]
# Rename columns: Remove "__o"
colnames(top_5_relabund) <- gsub("o__", "", colnames(top_5_relabund))
# Add relabundance of the top 5 orders as new columns in colData(tse)
for (order in colnames(top_5_relabund)) {
colData(tse)[[paste0(order, "_relabund")]] <- top_5_relabund[, order]
}
# Define the relative abundance columns
relabund_columns <- c("Aeromonadales_relabund", "Burkholderiales_relabund",
"Enterobacterales_relabund", "Pseudomonadales_relabund",
"Rhizobiales_relabund")
# Create a list to store the plots
plots_list <- list()
# Loop through each column and create a plot
for (col in relabund_columns) {
# Extract the order name by removing "_relabund"
order_name <- gsub("_relabund", "", col)
# Create the plot
plot <- ggplot(mapping = aes_string(x = col, y = "ARG_load", color = "colData(tse)$Administrative_region"),
data = colData(tse)) +
geom_point() +
labs(x = paste("Rel. abundance of", order_name),
y = "ARG load", color = "Administrative region") +
scale_x_continuous(limits = c(0, 1)) +
theme_bw() +
scale_fill_viridis_c(option = "magma") +
ggtitle(paste("o_", order_name)) +
theme(legend.position = "none") # Remove legend from individual plots
# Store the plot in the list
plots_list[[order_name]] <- plot
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine all plots into one row with a shared legend on the right
combined_order_plot <- wrap_plots(plots_list, ncol = 5) +
plot_layout(guides = "collect") # Collect legends and place one legend
ggsave("combined_order_plot.png", combined_order_plot, height = 5, width = 15)
print(combined_order_plot)

Correlation testing between Orders with mean relative abundance >
1% and ARG load
# Calculate mean relative abundance of all orders
mean_relabundance_order <- rowMeans(relabundance_matrix_order)
# Filter mean_relabundance_order to keep only values > 1%
mean_relabundance_order <- mean_relabundance_order[mean_relabundance_order > 1]
# Order alphabetically
mean_relabundance_order <- mean_relabundance_order[order(names(mean_relabundance_order))]
# Print the updated object
print(mean_relabundance_order)
## o__Aeromonadales o__Bacillales o__Burkholderiales
## 9.712867 2.363786 16.275879
## o__Caulobacterales o__Corynebacteriales o__Deinococcales
## 1.222198 3.285481 1.085390
## o__Enterobacterales o__Flavobacteriales o__Micrococcales
## 4.785705 4.075827 3.712140
## o__Micromonosporales o__Nostocales o__Propionibacteriales
## 1.060213 3.708840 1.449689
## o__Pseudomonadales o__Rhizobiales o__Rhodobacterales
## 7.112397 10.002062 3.718957
## o__Rhodocyclales o__Rhodospirillales o__Sphingomonadales
## 1.373476 1.354292 2.996436
## o__Streptomycetales o__Xanthomonadales
## 3.224572 2.735867
# Pearson correlation testing
# Prepare a dataframe for correlation testing between Orders and ARG_load
# Extract the order names from mean_relabundance_order
selected_orders <- names(mean_relabundance_order)
# Initialize a list to store results for the selected orders
order_relabund_list <- list()
# Loop through each order in mean_relabundance_order
for (order in selected_orders) {
# Subset the data for the current order
tse_order <- tse[rowData(tse)$order %in% c(order) & !is.na(rowData(tse)$order), ]
# Calculate column sums of the "relabundance" assay for the current order
order_relabund <- colSums(assay(tse_order, "relabundance"))
# Store the result in the list, with the order name as the key
order_relabund_list[[order]] <- order_relabund
}
# Combine the list into a data frame (orders as rows)
order_relabund_df <- do.call(rbind, order_relabund_list)
# Convert the list to a data frame
order_relabund_df <- as.data.frame(order_relabund_df)
# Transpose the data frame (samples as rows, order mean relative abundances as columns)
order_relabund_df <- t(order_relabund_df)
# Remove "o__" from the start of each column name
colnames(order_relabund_df) <- gsub("^o__", "", colnames(order_relabund_df))
# Access the variables
variable1 <- colData(tse)$ARG_load
variable2 <- as.numeric(order_relabund_df[, "Aeromonadales"])
variable3 <- as.numeric(order_relabund_df[, "Bacillales"])
variable4 <- as.numeric(order_relabund_df[, "Burkholderiales"])
variable5 <- as.numeric(order_relabund_df[, "Caulobacterales"])
variable6 <- as.numeric(order_relabund_df[, "Corynebacteriales"])
variable7 <- as.numeric(order_relabund_df[, "Deinococcales"])
variable8 <- as.numeric(order_relabund_df[, "Enterobacterales"])
variable9 <- as.numeric(order_relabund_df[, "Flavobacteriales"])
variable10 <- as.numeric(order_relabund_df[, "Micrococcales"])
variable11 <- as.numeric(order_relabund_df[, "Micromonosporales"])
variable12 <- as.numeric(order_relabund_df[, "Nostocales"])
variable13 <- as.numeric(order_relabund_df[, "Propionibacteriales"])
variable14 <- as.numeric(order_relabund_df[, "Pseudomonadales"])
variable15 <- as.numeric(order_relabund_df[, "Rhizobiales"])
variable16 <- as.numeric(order_relabund_df[, "Rhodobacterales"])
variable17 <- as.numeric(order_relabund_df[, "Rhodocyclales"])
variable18 <- as.numeric(order_relabund_df[, "Rhodospirillales"])
variable19 <- as.numeric(order_relabund_df[, "Sphingomonadales"])
variable20 <- as.numeric(order_relabund_df[, "Streptomycetales"])
variable21 <- as.numeric(order_relabund_df[, "Xanthomonadales"])
# Calculate Pearson's correlation coefficients and p-values
cor_test_Aeromonadales <- cor.test(variable1, variable2, method = "spearman")
cor_test_Bacillales <- cor.test(variable1, variable3, method = "spearman")
cor_test_Burkholderiales <- cor.test(variable1, variable4, method = "spearman")
cor_test_Caulobacterales <- cor.test(variable1, variable5, method = "spearman")
cor_test_Corynebacteriales <- cor.test(variable1, variable6, method = "spearman")
cor_test_Deinococcales <- cor.test(variable1, variable7, method = "spearman")
cor_test_Enterobacterales <- cor.test(variable1, variable8, method = "spearman")
cor_test_Flavobacteriales <- cor.test(variable1, variable9, method = "spearman")
cor_test_Micrococcales <- cor.test(variable1, variable10, method = "spearman")
cor_test_Micromonosporales <- cor.test(variable1, variable11, method = "spearman")
cor_test_Nostocales <- cor.test(variable1, variable12, method = "spearman")
cor_test_Propionibacteriales <- cor.test(variable1, variable13, method = "spearman")
cor_test_Pseudomonadales <- cor.test(variable1, variable14, method = "spearman")
cor_test_Rhizobiales <- cor.test(variable1, variable15, method = "spearman")
cor_test_Rhodobacterales <- cor.test(variable1, variable16, method = "spearman")
cor_test_Rhodocyclales <- cor.test(variable1, variable17, method = "spearman")
cor_test_Rhodospirillales <- cor.test(variable1, variable18, method = "spearman")
cor_test_Sphingomonadales <- cor.test(variable1, variable19, method = "spearman")
cor_test_Streptomycetales <- cor.test(variable1, variable20, method = "spearman")
cor_test_Xanthomonadales <- cor.test(variable1, variable21, method = "spearman")
# Extract the correlation coefficients and p-values from each test
cor_results <- data.frame(
Order = c("Aeromonadales", "Bacillales", "Burkholderiales", "Caulobacterales",
"Corynebacteriales", "Deinococcales", "Enterobacterales", "Flavobacteriales",
"Micrococcales", "Micromonosporales", "Nostocales", "Propionibacteriales",
"Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodocyclales",
"Rhodospirillales", "Sphingomonadales", "Streptomycetales", "Xanthomonadales"),
CorrelationCoefficient = c(cor_test_Aeromonadales$estimate, cor_test_Bacillales$estimate,
cor_test_Burkholderiales$estimate, cor_test_Caulobacterales$estimate,
cor_test_Corynebacteriales$estimate, cor_test_Deinococcales$estimate,
cor_test_Enterobacterales$estimate, cor_test_Flavobacteriales$estimate,
cor_test_Micrococcales$estimate, cor_test_Micromonosporales$estimate,
cor_test_Nostocales$estimate, cor_test_Propionibacteriales$estimate,
cor_test_Pseudomonadales$estimate, cor_test_Rhizobiales$estimate,
cor_test_Rhodobacterales$estimate, cor_test_Rhodocyclales$estimate,
cor_test_Rhodospirillales$estimate, cor_test_Sphingomonadales$estimate,
cor_test_Streptomycetales$estimate, cor_test_Xanthomonadales$estimate),
PValue = c(cor_test_Aeromonadales$p.value, cor_test_Bacillales$p.value,
cor_test_Burkholderiales$p.value, cor_test_Caulobacterales$p.value,
cor_test_Corynebacteriales$p.value, cor_test_Deinococcales$p.value,
cor_test_Enterobacterales$p.value, cor_test_Flavobacteriales$p.value,
cor_test_Micrococcales$p.value, cor_test_Micromonosporales$p.value,
cor_test_Nostocales$p.value, cor_test_Propionibacteriales$p.value,
cor_test_Pseudomonadales$p.value, cor_test_Rhizobiales$p.value,
cor_test_Rhodobacterales$p.value, cor_test_Rhodocyclales$p.value,
cor_test_Rhodospirillales$p.value, cor_test_Sphingomonadales$p.value,
cor_test_Streptomycetales$p.value, cor_test_Xanthomonadales$p.value)
)
# View the resulting data frame with correlation coefficients and p-values
print(cor_results)
## Order CorrelationCoefficient PValue
## 1 Aeromonadales 0.357142857 0.191620173
## 2 Bacillales -0.303571429 0.270772063
## 3 Burkholderiales 0.085714286 0.762997252
## 4 Caulobacterales 0.242857143 0.381991520
## 5 Corynebacteriales -0.439285714 0.103199216
## 6 Deinococcales -0.103571429 0.714423628
## 7 Enterobacterales -0.110714286 0.695276099
## 8 Flavobacteriales 0.210714286 0.449865622
## 9 Micrococcales -0.528571429 0.045425102
## 10 Micromonosporales -0.653571429 0.010031783
## 11 Nostocales -0.710714286 0.004055074
## 12 Propionibacteriales -0.664285714 0.008578169
## 13 Pseudomonadales 0.603571429 0.019540831
## 14 Rhizobiales -0.442857143 0.100186234
## 15 Rhodobacterales -0.332142857 0.226353138
## 16 Rhodocyclales -0.075000000 0.792571062
## 17 Rhodospirillales -0.478571429 0.073466155
## 18 Sphingomonadales -0.082142857 0.772822127
## 19 Streptomycetales -0.660714286 0.009043086
## 20 Xanthomonadales -0.007142857 0.984662517
Exploration of the ARGs
Antibiotic Class
altExp(tse, "read_abclass") <- agglomerateByVariable(
altExp(tse, "read"),
rowData(altExp(tse, "read"))$Class,
by = "rows",
assay.type = "abundances"
)
## Warning: 'pa' includes binary values.
## Agglomeration of it might lead to meaningless values.
## Check the assay, and consider doing transformation againmanually with agglomerated data.
# Convert to relative abundance
altExp(tse, "read_abclass") <- transformAssay(altExp(tse, "read_abclass"), assay.type = "abundances", method = "relabundance")
# Plot the abundance
ARG_class_plot <- plotAbundance(altExp(tse, "read_abclass"), assay.type = "relabundance", rank = "Class", add_x_text = TRUE)
# Save the plot
ggsave("ARG_class_plot.png", plot = ARG_class_plot, width = 7, height = 4, dpi = 600)
# Print the plot
print(ARG_class_plot)

# Make a relative abundance matrix (RPKM) for ARG class
altExp(tse, "read_abclass") <- transformAssay(altExp(tse, "read_abclass"), assay.type = "abundances", method = "relabundance")
relabundance_matrix_class <- assay(altExp(tse, "read_abclass"), "relabundance")
# Calculate the mean relative abundance of each ARG class
mean_relabundance_class <- (rowSums(relabundance_matrix_class)) / 15
# Order the ARG classes by mean relative abundance
top_classes <- names(sort(mean_relabundance_class, decreasing = TRUE)[1:14])
# Print mean ARG class relative abundance
print(mean_relabundance_class)
## Aminoglycoside Beta-lactam Beta-lactam (carbapenem)
## 0.0277312238 0.5357815461 0.1027188513
## Chloramphenicol Colistin Fosfomycin
## 0.0046715233 0.1438166043 0.0215088775
## Fusidic acid Lincosamide Macrolide
## 0.0019493327 0.0051260230 0.0214093242
## MDR (efflux pump) Quinolone Sulfonamide
## 0.0115892460 0.0003296112 0.0532232268
## Tetracycline Trimethoprim
## 0.0511944610 0.0189501488
ARG cluster90 heatmap
# Create an ARG cluster AltExp
altExp(tse, "arg_cluster") <- agglomerateByVariable(altExp(tse, "read"), rowData(altExp(tse, "read"))$Cluster90_description, by = "rows")
## Warning: 'pa' includes binary values.
## Agglomeration of it might lead to meaningless values.
## Check the assay, and consider doing transformation againmanually with agglomerated data.
# Add log abundance assay
tse_arg_cluster <- altExp(tse, "arg_cluster")
tse_arg_cluster <- transformAssay(tse_arg_cluster, assay.type = "abundances",
method = "log", pseudocount = TRUE, name = "log")
## A pseudocount of 0.137471073839662 was applied.
## Warning: non-integer data: divided by smallest positive value
# Get the assay table
mat <- assay(tse_arg_cluster, "log")
# Make a dataset for additional ARG information
arg_data <- as.data.frame(rowData(tse_arg_cluster))
arg_data <- arg_data[, "Class", drop = FALSE]
# Order the ARGs by their class
arg_data_ordered <- arg_data[order(arg_data$Class), , drop = FALSE]
# Reorder the data matrix rows according to the ordered ARGs
mat <- mat[rownames(arg_data_ordered), ]
# Define a color palette that goes from white to red
colors <- colorRampPalette(c("white", "red"))(100)
# Create the heatmap with the custom color palette
ARG_cluster_heatmap <- pheatmap(mat,
annotation_row = arg_data_ordered,
cluster_rows = FALSE, cluster_cols = FALSE,
cellwidth = 15, cellheight = 10,
color = colors)
# Save the plot
ggsave("ARG_heatmap.png", plot = ARG_cluster_heatmap, width = 7, height = 4, dpi = 600)
# Print the plot
print(ARG_cluster_heatmap)

IS elements
# Print the number of IS elements in metagenome assemblies
print(colData(tse)$IS_assembled)
## [1] 12 29 8 11 35 48 97 58 67 13 19 28 25 70 47
# Calculate mean number of IS elements in assemblies
print(mean(colData(tse)$IS_assembled))
## [1] 37.8
# Calculate median number of IS elements in assemblies
print(median(colData(tse)$IS_assembled))
## [1] 29
# Calculate no. of transposable elements per 1,000,000,000 assembled bases
colData(tse)$IS_per_Mb <- (colData(tse)$IS_assembled / colData(tse)$assembly_length) * 1000000
# Calculate no. of ARGs per 1,000,000,000 assembled bases
colData(tse)$ARGs_per_Mb <- (colData(tse)$ARGs_assembly / colData(tse)$assembly_length) * 1000000
ARGs and IS elements in MAGs
# Load MAG_data
MAG_data <- read.csv("MAG_data.csv")
MAG_AMR <- read.csv("MAG_AMR.csv")
MAG_IS_elements <- read.csv("MAG_IS_elements.csv")
# ARGs per complete genome
MAG_data$ARGs_per_genome <- MAG_data$no_ARGs/(MAG_data$Completeness/100)
# IS elements per complete genome
MAG_data$IS_per_genome <- MAG_data$no_IS/(MAG_data$Completeness/100)
# Reverse the order of Genus variable for A-> Z plotting
MAG_data <- MAG_data %>%
mutate(Genus = factor(Genus, levels = rev(sort(unique(Genus)))))
# Plot the genera of mid & high quality MAGs
MAG_taxa <- ggplot(mapping = aes(x = Genus, fill = Quality), data = MAG_data) +
geom_bar() +
theme_minimal() +
labs(y = "Count", fill = "MAG quality") +
scale_x_reverse() +
coord_flip() +
scale_fill_manual(
values = c(
"High" = "#CCCCCC", # Light Gray
"Medium" = "#666666" # Dark Gray
)
) +
scale_x_discrete(
labels = function(x) sapply(x, function(i) bquote(italic(.(i)))) # Italicize genus names
)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Save the plot
ggsave("MAG_taxa.png", plot = MAG_taxa, width = 4.5, height = 4, dpi = 300)
# Print the plot
print(MAG_taxa)

# Reverse the order of AMR determinant variable for A -> Z plotting
MAG_AMR <- MAG_AMR %>%
mutate(AMR_determinant = factor(AMR_determinant, levels = rev(sort(unique(AMR_determinant)))))
# Plot the AMR determinants in mid & high quality MAGs
MAG_AMR_plot <- ggplot(mapping = aes(x = AMR_determinant, fill = Genus), data = MAG_AMR) +
geom_bar() +
theme_minimal() +
labs(x = "AMR determinant", y = "Count") +
coord_flip() +
scale_fill_manual(
values = c(
"Aeromonas" = "#88CCEE",
"Shewanella" = "#44AA99",
"Stenotrophomonas" = "#AA4499"
),
labels = c(
"Aeromonas" = expression(italic("Aeromonas")),
"Shewanella" = expression(italic("Shewanella")),
"Stenotrophomonas" = expression(italic("Stenotrophomonas"))
)
)
# Save the plot
ggsave("MAG_AMR_plot.png", plot = MAG_AMR_plot, width = 4.5, height = 4, dpi = 300)
# Print the plot
print(MAG_AMR_plot)

# Reverse the order of IS families variable for A-> Z plotting
MAG_IS_elements <- MAG_IS_elements %>%
mutate(IS_family = factor(IS_family, levels = rev(sort(unique(IS_family)))))
# Plot the IS families in mid & high quality MAGs
MAG_IS_plot <- ggplot(mapping = aes(x = IS_family, fill = Genus), data = MAG_IS_elements) +
geom_bar() +
theme_minimal() +
labs(x = "IS family", y = "Count") +
coord_flip() +
scale_fill_manual(
values = c(
"Aeromonas" = "#88CCEE", # Muted Blue
"Aliarcobacter" = "#117733", # Deep Green
"Limnohabitans" = "#DDCC77", # Muted Yellow
"Massilia" = "#332288", # Deep Blue
"Serratia" = "#CC6677", # Muted Coral
"Shewanella" = "#44AA99", # Muted Teal
"Stenotrophomonas" = "#AA4499", # Muted Purple
"Trichormus" = "#999933" # Muted Olive
),
labels = c(
"Aeromonas" = expression(italic("Aeromonas")),
"Aliarcobacter" = expression(italic("Aliarcobacter")),
"Limnohabitans" = expression(italic("Limnohabitans")),
"Massilia" = expression(italic("Massilia")),
"Serratia" = expression(italic("Serratia")),
"Shewanella" = expression(italic("Shewanella")),
"Stenotrophomonas" = expression(italic("Stenotrophomonas")),
"Trichormus" = expression(italic("Trichormus"))
)
)
# Save the plot
ggsave("MAG_IS_plot.png", plot = MAG_IS_plot, width = 4.5, height = 4, dpi = 300)
# Print the plot
print(MAG_IS_plot)

Gammaproteobacteria vs other classes
# Gammaproteobacteria (the class within Proteobacteria from which all binned ARGs were identified) vs other bacteria
MAG_data$Gammaproteo <- ifelse(MAG_data$Class == "Gammaproteobacteria", "Yes", "No")
# ARGs per group
gammaproteo_ARGs <- MAG_data$ARGs_per_genome[MAG_data$Gammaproteo == "Yes"]
others_ARGs <- MAG_data$ARGs_per_genome[MAG_data$Gammaproteo == "No"]
# IS elements per group
gammaproteo_IS <- MAG_data$IS_per_genome[MAG_data$Gammaproteo == "Yes"]
others_IS <- MAG_data$IS_per_genome[MAG_data$Gammaproteo == "No"]
ARGs
# Print mean ARGs per complete Gammaproteobacteria genome
print(mean(gammaproteo_ARGs))
## [1] 0.6155669
# Print mean ARGs per complete 'other' genome
print(mean(others_ARGs))
## [1] 0
# Test for statistical difference (non-parametric) using Wilcoxon rank-sum test
wilcox.test(gammaproteo_ARGs, others_ARGs)
## Warning in wilcox.test.default(gammaproteo_ARGs, others_ARGs): cannot compute
## exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: gammaproteo_ARGs and others_ARGs
## W = 132, p-value = 0.03913
## alternative hypothesis: true location shift is not equal to 0
IS elements
# Print mean ARGs per complete Gammaproteobacteria genome
print(mean(gammaproteo_IS))
## [1] 1.031057
# Print mean ARGs per complete 'other' genome
print(mean(others_IS))
## [1] 0.568052
# Test for statistical difference (non-parametric) using Wilcoxon rank-sum test
wilcox.test(gammaproteo_IS, others_IS)
## Warning in wilcox.test.default(gammaproteo_IS, others_IS): cannot compute exact
## p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: gammaproteo_IS and others_IS
## W = 129.5, p-value = 0.1223
## alternative hypothesis: true location shift is not equal to 0
The effects of sequencing depth on ARG and MAG identification
MAG recovery
# Plot sequencing depth vs MAGs
depth_vs_MAGs_plot <- ggplot(mapping = aes(total_reads, mid_high_MAG), data = colData(tse)) +
geom_point() +
labs(x = "Number of reads", y = "Number of MAGs recovered") +
scale_x_continuous(labels = scales::comma, limits = c(0, 40000000)) +
scale_y_continuous(limits = c(0, 5)) +
theme_bw()
# Save the plot
ggsave("depth_vs_MAGs_plot.png", plot = depth_vs_MAGs_plot, width = 7, height = 5)
# Print the plot
print(depth_vs_MAGs_plot)

# Access variables from colData
variable1 <- colData(tse)$total_reads
variable2 <- colData(tse)$mid_high_MAG
# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties
# Print correlation coefficient and its associated p-value
print(cor_test)
##
## Spearman's rank correlation rho
##
## data: variable1 and variable2
## S = 443.29, p-value = 0.4561
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2084034
ARG richness
# Plot sequencing depth vs MAGs
depth_vs_ARGs_plot <- ggplot(mapping = aes(total_reads, ARG_richness), data = colData(tse)) +
geom_point() +
labs(x = "Number of reads", y = "Number of unique ARGs Detected") +
scale_x_continuous(labels = scales::comma, limits = c(0, 40000000)) +
scale_y_continuous(limits = c(0, 16)) +
theme_bw()
# Save the plot
ggsave("depth_vs_ARGs_plot.png", plot = depth_vs_ARGs_plot, width = 7, height = 5)
# Print the plot
print(depth_vs_ARGs_plot)

# Access variables from colData
variable1 <- colData(tse)$total_reads
variable2 <- colData(tse)$ARG_richness
# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties
# Print correlation coefficient and its associated p-value
print(cor_test)
##
## Spearman's rank correlation rho
##
## data: variable1 and variable2
## S = 401.43, p-value = 0.3065
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2831559
Rarefaction curves
# Import rarefaction data for samples with more than 20M reads following trimming and filtering
rarefaction_data <- read.csv("rarefaction_data.csv")
# Ensure Sample is a factor
rarefaction_data$Sample <- as.factor(rarefaction_data$Sample)
# Plot rarefaction curves
rarefaction_ARG_plot <- ggplot(rarefaction_data, aes(x = Depth, y = ARGs_detected, color = Sample, linetype = Sample)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
labs(
x = "Reads post-QC",
y = "Number of unique ARGs Detected"
) +
scale_linetype_manual(values = c("dotted", "twodash", "dashed", "dotdash", "twodash")) +
scale_x_continuous(labels = scales::comma, limits = c(0, 30000000)) +
scale_y_continuous(limits = c(0, 15)) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Save the plot
ggsave("rarefaction_ARG_plot.png", plot = rarefaction_ARG_plot, width = 7, height = 5)
# Print the plot
print(rarefaction_ARG_plot)
