# Import the microbiome study data object (Tree Summarized Experiment)
tse <- readRDS("O'Ferrall_snails_2024.rds")

## Attach packages
library(knitr)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(mia)

## Loading required package: MultiAssayExperiment

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## Loading required package: SingleCellExperiment

## Loading required package: TreeSummarizedExperiment

## Loading required package: Biostrings

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## This is mia version 1.14.0
## - Online documentation and vignettes: https://microbiome.github.io/mia/
## - Online book 'Orchestrating Microbiome Analysis (OMA)': https://microbiome.github.io/OMA/docs/devel/

## 
## Attaching package: 'mia'

## The following objects are masked from 'package:dplyr':
## 
##     full_join, inner_join, left_join, right_join

library(miaViz)

## Loading required package: ggraph

## 
## Attaching package: 'miaViz'

## The following object is masked from 'package:mia':
## 
##     plotNMDS

library(pheatmap)
library(ggrepel)
library(patchwork)
library(scater)

## Loading required package: scuttle

Taxonomy and recovered metagenome assembled genomes (MAGs)

# Calculate the number of unique taxa for each taxonomic rank
apply(rowData(tse), 2, function (x) {length(unique(x))})

##     domain     phylum      class      order     family      genus    species 
##          1         39         73        160        352       1246       4773 
## clade_name 
##       4773

# Print total number of high-quality MAGs
print(sum(colData(tse)$high_qual_MAG))

## [1] 8

# Print total number of medium-quality MAGs
print(sum(colData(tse)$mid_qual_MAG))

## [1] 21

Phyla

# Make a relative abundance matrix for phyla
relabundance_matrix_phylum <- assay(altExp(tse, "phylum"), "relabundance")

# Calculate the total relative abundance of each phylum (row sums)
total_relabundance_phylum <- rowSums(relabundance_matrix_phylum)

# Identify the top 7 phyla (NB. these have mean relative abundance > 1%)
top_phyla <- names(sort(total_relabundance_phylum, decreasing = TRUE)[1:7])

# Make a new TSE where the phyla with relative abundance > 1% are recognized, while others are "-Other"
tse_top_7_phyla <- altExp(tse, "phylum")
rowData(tse_top_7_phyla)$phylum <- ifelse(rowData(tse_top_7_phyla)$phylum %in% top_phyla, rowData(tse_top_7_phyla)$phylum, "-Other")
rowData(tse_top_7_phyla)$Rank <- gsub("^p__", "", rowData(tse_top_7_phyla)$phylum)

# Define custom colors
custom_phyla_colors <- c(
  "Actinobacteria" = "#F8766D",
  "Bacteroidetes" = "#7CAE00",
  "Cyanobacteria" = "#00BFC4",
  "Deinococcus-Thermus" = "#C77CFF",
  "Firmicutes" = "#A0522D",
  "Planctomycetes" = "#FF69B4",
  "Proteobacteria" = "#1F4E79",
  "-Other" = "#BEBEBE"
)

# Plot the abundance
phyla_plot <- plotAbundance(tse_top_7_phyla, assay.type = "relabundance", rank = "Rank", add_x_text = TRUE) +
  scale_fill_manual(values = custom_phyla_colors, name = "Phylum") +
  scale_colour_manual(values = custom_phyla_colors, name = "Phylum") +
  guides(fill = guide_legend(override.aes = list(color = NA))) +
  theme(plot.margin = margin(t = 30, r = 10, b = 10, l = 10))

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# Save the plot
ggsave("phyla_plot.png", plot = phyla_plot, width = 7, height = 5, dpi = 600)

# Print the plot
print(phyla_plot)

Calculate Proteobacteria mean relative abundance

# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_phylum <- relabundance_matrix_phylum * 100

# Extract Proteobacteria
proteobacteria_abundance <- relabundance_matrix_phylum["p__Proteobacteria", , drop = FALSE]

# Calculate mean Proteobacteria relative abundance
mean_proteobacteria_relabund <- sum(proteobacteria_abundance) / 15
print(mean_proteobacteria_relabund)

## [1] 66.27109

Orders

# Make a relative abundance matrix for order
relabundance_matrix_order <- assay(altExp(tse, "order"), "relabundance")

# Calculate the total relative abundance of each order (row sums)
total_relabundance_order <- rowSums(relabundance_matrix_order)

# Identify the top 20 orders (NB. these have mean relative abundance > 1%)
top_orders <- names(sort(total_relabundance_order, decreasing = TRUE)[1:20])

# Make a new TSE where the orders with relative abundance > 1% are recognized, while others are "-Other"
tse_top_20_orders <- altExp(tse, "order")
rowData(tse_top_20_orders)$order <- ifelse(rowData(tse_top_20_orders)$order %in% top_orders, rowData(tse_top_20_orders)$order, "-Other")
rowData(tse_top_20_orders)$Rank <- gsub("^o__", "", rowData(tse_top_20_orders)$order)

custom_order_colors <- c(
  "-Other" = "#BEBEBE",
  "Flavobacteriales" = "yellow",
  "Deinococcales" = "#377EB8",
  "Nostocales" = "#4DAF4A",
  "Bacillales" = "#984EA3",
  "Micromonosporales" = "#FF7F00",
  "Propionibacteriales" = "grey20",
  "Streptomycetales" = "#A65628",
  "Corynebacteriales" = "#00441B",
  "Micrococcales" = "blue",
  "Xanthomonadales" = "#66C2A5",
  "Enterobacterales" = "#FC8D62",
  "Pseudomonadales" = "#8DA0CB",
  "Aeromonadales" = "#D73027",
  "Caulobacterales" = "#A6D854",
  "Rhodospirillales" = "#E6AB02",
  "Sphingomonadales" = "#E5C494",
  "Rhodobacterales" = "#54278F",
  "Rhizobiales" = "#D95F02",
  "Rhodocyclales" = "#7570B3",
  "Burkholderiales" = "#1B9E77"
)

# Plot the abundance
order_plot <- plotAbundance(tse_top_20_orders, assay.type = "relabundance", rank = "Rank", add_x_text = TRUE) +
  scale_fill_manual(values = custom_order_colors, name = "Order") +
  scale_colour_manual(values = custom_order_colors, name = "Order") +
  guides(fill = guide_legend(override.aes = list(color = NA)))

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# Save the plot
ggsave("order_plot.png", plot = order_plot, width = 8.2, height = 5, dpi = 600)

# Print the plot
print(order_plot)

Calculate Enterobacterales mean relative abundance

# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_order <- relabundance_matrix_order * 100

# Extract Enterobacterales
enterobacterales_abundance <- relabundance_matrix_order["o__Enterobacterales", , drop = FALSE]

# Calculate mean Enterobacterales relative abundance
mean_enterobacterales_relabund <- sum(enterobacterales_abundance) / 15
print(mean_enterobacterales_relabund)

## [1] 4.785705

Calculate Enterobacterales mean relative abundance with outlier (Uk1) removed

# Remove Uk1 from the Enterobacterales relative abundance dataframe
enterobacterales_abundance_minus_Uk1 <- as.data.frame(enterobacterales_abundance)
enterobacterales_abundance_minus_Uk1$Uk1 <- NULL

# Calculate mean Enterobacterales relative abundance without Uk1
mean_enterobacterales_relabund_minus_Uk1 <- sum(enterobacterales_abundance_minus_Uk1) / 14
print(mean_enterobacterales_relabund_minus_Uk1)

## [1] 2.14886

Exploring relative abundance of Shewanella (genus level)

# Make a relative abundance matrix for genus
relabundance_matrix_genus <- assay(altExp(tse, "genus"), "relabundance")

# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_genus <- relabundance_matrix_genus * 100

# Extract Shewanella
shewanella_abundance <- relabundance_matrix_genus["g__Shewanella", , drop = FALSE]

# Print Shewanella relabundance
print(shewanella_abundance)

##                       M1         M2         M3        M4       M5         M6
## g__Shewanella 0.07431693 0.07474352 0.09481099 0.2391963 5.065212 0.06392155
##                      Ug1       Ug2        Ug3         Z1         Z2        Z3
## g__Shewanella 0.09437259 0.2747101 0.07968018 0.05922727 0.08122704 0.1393015
##                       Z4        Uk1      Uk2
## g__Shewanella 0.08427348 0.09382548 1.328371

# Calculate mean Shewanella relative abundance
mean_shewanella_relabundance <- sum(shewanella_abundance) / 15
print(mean_shewanella_relabundance)

## [1] 0.523146

# Remove M5 and Uk2 from the Shewanella relative abundance dataframe
shewanella_abundance_minus_M5_Uk2 <- as.data.frame(shewanella_abundance)
shewanella_abundance_minus_M5_Uk2$M5 <- NULL
shewanella_abundance_minus_M5_Uk2$Uk2 <- NULL

# Calculate mean Shewanella relative abundance without M5 and Uk2 from which OXA-48-like genes were later identified
mean_shewanella_relabundance_minus_M5_Uk2 <- sum(shewanella_abundance_minus_M5_Uk2) / 13
print(mean_shewanella_relabundance_minus_M5_Uk2)

## [1] 0.1118159

Exploring relative abundance of Aeromonas (genus level)

# Extract Aeromonas
aeromonas_abundance <- relabundance_matrix_genus["g__Aeromonas", , drop = FALSE]

# Print Aeromonas relabundance
print(aeromonas_abundance)

##                    M1       M2       M3       M4       M5       M6      Ug1
## g__Aeromonas 6.476615 1.611563 6.747668 12.41048 53.81697 1.920293 3.322136
##                   Ug2      Ug3       Z1       Z2       Z3       Z4      Uk1
## g__Aeromonas 6.234634 29.57037 4.387261 3.711667 5.622518 4.802656 1.444432
##                 Uk2
## g__Aeromonas 4.5081

# Calculate mean Aeromonas relative abundance
mean_aeromonas_relabundance <- sum(aeromonas_abundance) / 15
print(mean_aeromonas_relabundance)

## [1] 9.772491

Hierarchical clustering of samples and taxa

# Obtain read count data for the top 30 Orders
tse_order <- altExp(tse, "order")

# Add clr-transformation
tse_order <- transformAssay(tse_order, assay.type = "counts", method = "relabundance")
tse_order <- transformAssay(tse_order, assay.type = "relabundance",
                              method = "clr", pseudocount = TRUE)

## A pseudocount of 4.57444733324424e-07 was applied.

# Apply a z-transformation across rows (taxa)
tse_order <- transformAssay(tse_order, assay.type = "clr", 
                              MARGIN = "features",
                              method = "standardize", name = "clr_z")

# Get 50 most abundant orders, and subsets the data by them
top_50_orders <- names(sort(total_relabundance_order, decreasing = TRUE)[1:50])
tse_order_subset <- tse_order[top_50_orders, ]

# Calculate the proportion of bacteria accounted for in the top 50 Orders
(sum(assay(tse_order_subset, "relabundance"))) / 15

## [1] 0.9792133

# Gets the assay table
mat <- assay(tse_order_subset, "clr_z")

# Make a dataframe for sample information
sample_data <- as.data.frame(colData(tse_order_subset))
sample_data <- sample_data[, c("Continent", "Administrative_region", "snail_genus"), drop = FALSE]
names(sample_data)[names(sample_data) == "snail_genus"] <- "Snail genus"
names(sample_data)[names(sample_data) == "Administrative_region"] <- "Administrative region"

# Make a dataframe for taxonomic information
taxonomic_data <- as.data.frame(rowData(tse_order_subset))
taxonomic_data <- taxonomic_data[, "phylum", drop = FALSE]
colnames(taxonomic_data)[colnames(taxonomic_data) == "phylum"] <- "Phylum"
taxonomic_data$Phylum <- sub("^p__", "", taxonomic_data$Phylum)

# Set heatmap scaling and colors
breaks <- seq(-ceiling(max(abs(mat))), ceiling(max(abs(mat))), 
              length.out = ifelse( max(abs(mat))>5, 2*ceiling(max(abs(mat))), 10 ) )
colors <- colorRampPalette(c("darkblue", "blue", "white", "red", "darkred"))(length(breaks)-1)

# Set column annotation colors
ann_colors <- list(
  "Administrative region" = c(
    "Malawi" = "forestgreen",
    "Uganda" = "yellowgreen",
    "United Kingdom" = "greenyellow",
    "Zanzibar" = "yellow2"
  ),
  Phylum = c(
    "Actinobacteria" = "#1B9E77",
    "Bacteroidetes" = "#66C2A5",
    "Cyanobacteria" = "#3288BD",
    "Deinococcus-Thermus" = "#5E4FA2",
    "Firmicutes" = "#ABDDA4",
    "Planctomycetes" = "#A6CEE3",
    "Proteobacteria" = "#8DA0CB",
    "Tenericutes" = "#E0F3F8",
    "Verrucomicrobia" = "#00441B"
  ),
  "Snail genus" = c(
    "Anisus" = "darkorange2",
    "Biomphalaria" = "orange",
    "Bulinus" = "red",
    "Cleopatra" = "red3",
    "Lanistes" = "tan4",
    "Lymnaea" = "bisque3",
    "Physella" = "bisque",
    "Pila" = "goldenrod3"
  ),
  Continent = c(
    "Africa" = "grey90",
    "Europe" = "grey35"
  )
)

# Create the heatmap
order_clusters <- pheatmap(mat, annotation_col = sample_data, annotation_row = taxonomic_data, annotation_colors = ann_colors, color = colors, breaks = breaks, legend_width = 0.5)

# Save the plot
ggsave("order_clusters.png", plot = order_clusters, width = 10, height = 8, dpi = 600)

# Print plot
print(order_clusters)

Diversity

Alpha diversity

# Calculate diversity statistics
tse <- addAlpha(tse, index = "shannon")
tse <- addAlpha(tse, index = "gini_simpson")

# Print Shannon's diversity
print(colData(tse)$shannon)

##  [1] 7.039099 6.962225 7.282968 6.887802 3.881060 5.815881 6.333669 6.445851
##  [9] 6.051164 6.954845 7.372464 7.245500 7.300361 4.835209 6.701276

# Print mean Shannon's diversity
print(mean(colData(tse)$shannon))

## [1] 6.473958

# Print Shannon's diversity median
print(median(colData(tse)$shannon))

## [1] 6.887802

# Print Simpson's diversity
print(colData(tse)$gini_simpson)

##  [1] 0.9964133 0.9951006 0.9969621 0.9914432 0.7925213 0.9738414 0.9864923
##  [8] 0.9911865 0.9715998 0.9939893 0.9986682 0.9978338 0.9972464 0.8985658
## [15] 0.9941253

# Print mean Simpson's diversity
print(mean(colData(tse)$gini_simpson))

## [1] 0.9717326

# Print Simpson's diversity median
print(median(colData(tse)$gini_simpson))

## [1] 0.9939893

Proteobacteria vs alpha-diversity

# Add Proteobacteria relative abundance to colData
tse_proteobacteria <- tse[rowData(tse)$phylum %in% c("p__Proteobacteria") & !is.na(rowData(tse)$phylum), ]
proteobacteria_relabund <- colSums(assay(tse_proteobacteria, "relabundance"))
colData(tse)$proteobacteria_relabund <- proteobacteria_relabund

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$shannon

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

# Print correlation coefficient and its associated p-value
print(cor_test)

## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 916, p-value = 0.01287
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6357143

Beta diversity

# Generate beta-diversity statistics (Bray-Curtis distance)
tse <- addDissimilarity(tse, assay.type = "relabundance" , method = "bray")
bray_data <- metadata(tse)[["bray"]]

# Print values
print(bray_data)

##            M1        M2        M3        M4        M5        M6       Ug1
## M1  0.0000000 0.3950959 0.2578856 0.3099759 0.7660522 0.5444646 0.4956978
## M2  0.3950959 0.0000000 0.4055159 0.4547122 0.7835672 0.5872936 0.4744968
## M3  0.2578856 0.4055159 0.0000000 0.2293442 0.7544281 0.5129921 0.5095420
## M4  0.3099759 0.4547122 0.2293442 0.0000000 0.7060798 0.5080983 0.5440057
## M5  0.7660522 0.7835672 0.7544281 0.7060798 0.0000000 0.8186170 0.7946426
## M6  0.5444646 0.5872936 0.5129921 0.5080983 0.8186170 0.0000000 0.6178323
## Ug1 0.4956978 0.4744968 0.5095420 0.5440057 0.7946426 0.6178323 0.0000000
## Ug2 0.3837108 0.4853790 0.4557216 0.4802522 0.7355626 0.5711390 0.4613960
## Ug3 0.3780083 0.4766061 0.3650904 0.3433664 0.6036607 0.5409692 0.5308463
## Z1  0.4998875 0.5227628 0.4218702 0.4748654 0.7826303 0.6387735 0.5707443
## Z2  0.4271759 0.4772067 0.3131357 0.3968051 0.8023832 0.6310114 0.5866067
## Z3  0.3362135 0.4131174 0.2451802 0.3423994 0.7357711 0.5558053 0.5162808
## Z4  0.3727922 0.3795538 0.2517945 0.3499632 0.7418444 0.5907308 0.5478100
## Uk1 0.7194104 0.7556882 0.7674740 0.7746296 0.8438423 0.7702989 0.7174539
## Uk2 0.4059672 0.4191871 0.4452232 0.4895973 0.7560532 0.5714587 0.5117115
##           Ug2       Ug3        Z1        Z2        Z3        Z4       Uk1
## M1  0.3837108 0.3780083 0.4998875 0.4271759 0.3362135 0.3727922 0.7194104
## M2  0.4853790 0.4766061 0.5227628 0.4772067 0.4131174 0.3795538 0.7556882
## M3  0.4557216 0.3650904 0.4218702 0.3131357 0.2451802 0.2517945 0.7674740
## M4  0.4802522 0.3433664 0.4748654 0.3968051 0.3423994 0.3499632 0.7746296
## M5  0.7355626 0.6036607 0.7826303 0.8023832 0.7357711 0.7418444 0.8438423
## M6  0.5711390 0.5409692 0.6387735 0.6310114 0.5558053 0.5907308 0.7702989
## Ug1 0.4613960 0.5308463 0.5707443 0.5866067 0.5162808 0.5478100 0.7174539
## Ug2 0.0000000 0.4380097 0.5745889 0.5747490 0.4334179 0.5110393 0.7113661
## Ug3 0.4380097 0.0000000 0.5670455 0.5193736 0.4169732 0.4651173 0.7617499
## Z1  0.5745889 0.5670455 0.0000000 0.3211763 0.3837895 0.4210765 0.8087910
## Z2  0.5747490 0.5193736 0.3211763 0.0000000 0.2714854 0.2517693 0.8068478
## Z3  0.4334179 0.4169732 0.3837895 0.2714854 0.0000000 0.2320110 0.7428368
## Z4  0.5110393 0.4651173 0.4210765 0.2517693 0.2320110 0.0000000 0.7765925
## Uk1 0.7113661 0.7617499 0.8087910 0.8068478 0.7428368 0.7765925 0.0000000
## Uk2 0.4193660 0.4487173 0.5983182 0.5381829 0.4245999 0.4586818 0.7089693
##           Uk2
## M1  0.4059672
## M2  0.4191871
## M3  0.4452232
## M4  0.4895973
## M5  0.7560532
## M6  0.5714587
## Ug1 0.5117115
## Ug2 0.4193660
## Ug3 0.4487173
## Z1  0.5983182
## Z2  0.5381829
## Z3  0.4245999
## Z4  0.4586818
## Uk1 0.7089693
## Uk2 0.0000000

# Define custom colors for Snail genus
snail_genus_colors <- list(
 "Snail genus" = c(
    "Anisus" = "darkorange2",
    "Biomphalaria" = "orange",
    "Bulinus" = "red",
    "Cleopatra" = "red3",
    "Lanistes" = "tan4",
    "Lymnaea" = "bisque3",
    "Physella" = "bisque",
    "Pila" = "goldenrod3"
  )
)

# Make a dataset for additional ARG information
annotation <- as.data.frame(colData(tse))
annotation <- annotation[, "snail_genus", drop = FALSE]
names(annotation)[names(annotation) == "snail_genus"] <- "Snail genus"

# Heatmap
bray_curtis_plot <- pheatmap(bray_data, cluster_rows = FALSE, cluster_cols = FALSE,
  annotation_col = annotation, annotation_colors = snail_genus_colors,
  main = "Bray-Curtis distance"
)

# Save the plot
ggsave("bray_curtis_plot.png", bray_curtis_plot, height = 5, width = 8)

# Print the plot
print(bray_curtis_plot)

# Perform multidimensional scaling (MDS) using Bray-Curtis dissimilarity on relative abundance data
tse <- runMDS(tse, FUN = getDissimilarity, method = "bray", assay.type = "relabundance", name = "MDS_bray")

# Extract coordinates
mds_coords <- reducedDims(tse)$MDS_bray

# Create data frame for plotting
pcoa_df <- data.frame(
  Axis1 = mds_coords[,1],
  Axis2 = mds_coords[,2],
  SampleName = rownames(mds_coords)
)

# Add administrative region to the data frame for plotting
pcoa_df$Administrative_region <- colData(tse)$Administrative_region

# Plot PCoA with sample names
pcoa_plot <- ggplot(pcoa_df, aes(x = Axis1, y = Axis2, color = Administrative_region, label = SampleName)) +
  geom_point(size = 2) +
  geom_text_repel(size = 2.5, max.overlaps = 20) +
  theme_minimal() +
  labs(
    x = "PCoA Axis 1",
    y = "PCoA Axis 2",
    color = "Administrative region" 
  )

# Save the plot
ggsave("pcoa_plot.png", pcoa_plot, width = 7, height = 5)

# Print the plot
print(pcoa_plot)

ARGs

ARG load

# Calculate ARG load
colData(tse)$ARG_load <- colSums(assay(altExp(tse, "read"), "abundances"))

# Print ARG load
print(colData(tse)$ARG_load)

##         M1         M2         M3         M4         M5         M6        Ug1 
##  54.029774  59.618260  37.058997  65.323354 357.039501  10.657301  75.585206 
##        Ug2        Ug3         Z1         Z2         Z3         Z4        Uk1 
##  95.780916 329.591742  47.627164   3.161198  27.483552  16.687200  80.439274 
##        Uk2 
##  59.692572

# Calculate mean ARG load
print(mean(colData(tse)$ARG_load))

## [1] 87.98507

# Calculate median ARG load
print(median(colData(tse)$ARG_load))

## [1] 59.61826

ARG richness (number of ARG references that reads mapped to per sample)

# Calculate ARG richness
colData(tse)$ARG_richness <- colSums(assay(altExp(tse, "read"), "pa"))

# Print ARG richness
print(colData(tse)$ARG_richness)

##  M1  M2  M3  M4  M5  M6 Ug1 Ug2 Ug3  Z1  Z2  Z3  Z4 Uk1 Uk2 
##   4  10   3   3  11   2   4  16  12  13   1  10   6   6   5

# Calculate mean ARG richness
print(mean(colData(tse)$ARG_richness))

## [1] 7.066667

# Calculate median ARG richness
print(median(colData(tse)$ARG_richness))

## [1] 6

Assembly-based ARG detection

# Print ARG detected by assembly-based methods
print(colData(tse)$ARGs_assembly)

##  [1] 2 3 0 1 2 1 5 6 5 2 0 1 2 6 2

# Print sum of ARGs detected by assembly-based methods
print(sum((colData(tse)$ARGs_assembly)))

## [1] 38

# Calculate mean: ARGs in assembly-based detection
print(mean(colData(tse)$ARGs_assembly))

## [1] 2.533333

# Calculate variance: ARGs in assembly-based detection
print(var(colData(tse)$ARGs_assembly))

## [1] 4.12381

# Calculate median: ARGs in assembly-based detection
print(median(colData(tse)$ARGs_assembly))

## [1] 2

# Plot assembly vs read ARG detection
read_vs_assembly <- ggplot(mapping = aes(ARGs_assembly, ARG_richness), data = colData(tse)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(x = "ARGs detected by assembly", y = "ARGs detected by read mapping") +
  theme_bw() +
  scale_x_continuous(limits = c(-1, 17), expand = c(0, 0)) +
  scale_y_continuous(limits = c(-1, 17), expand = c(0, 0)) +
  coord_fixed()

# Save the plot
ggsave("read_vs_assembly.png", plot = read_vs_assembly)

## Saving 7 x 5 in image

# Print the plot
print(read_vs_assembly)

How many MAGs (and MAGs assigned to Proteobacteria) contain ARGs?

# Print total number of ARGs in a MAG
print(sum((colData(tse)$ARGs_MAG)))

## [1] 10

# Print total number of ARGs in a Proteobacteria MAG
print(sum((colData(tse)$ARGs_Proteobacteria_MAG)))

## [1] 10

Proteobacteria vs ARG load

# Plot annotated Proteobacteria relative abundance ARG load on log scale to improve visualisation
proteo_ARG_load_annotated <- ggplot(mapping = aes(proteobacteria_relabund, ARG_load, color = colData(tse)$"Administrative_region"), data = colData(tse)) +
  geom_point() +
  geom_text_repel(aes(label = colData(tse)$snail_genus), fontface = "italic", color = "black") +
  labs(x = "Rel. abundance of Proteobacteria", y = "ARG load",
       color = "Administrative region") +
  theme_bw() +
  scale_x_continuous(limits = c(0, 1)) +
  scale_fill_viridis_c(option = "magma")

# Save the plot
ggsave("proteo_ARG_load_annotated.png", plot = proteo_ARG_load_annotated, width = 7, height = 5)

# Print the plot
print(proteo_ARG_load_annotated)

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$ARG_load

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

# Print correlation coefficient and its associated p-value
print(cor_test)

## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 86, p-value = 6.005e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8464286

Proteobacteria vs ARG richness

# Plot Proteobacteria relative abundance vs ARG richness
proteo_ARG_richness <- ggplot(mapping = aes(proteobacteria_relabund, ARG_richness, color = colData(tse)$"Administrative_region"), data = colData(tse)) +
  geom_point() +
  geom_text_repel(aes(label = colData(tse)$snail_genus), fontface = "italic", color = "black") +
  labs(x = "Rel. abundance of Proteobacteria", y = "ARG richness",
       color = "Administrative region") +
  theme_bw() +
  scale_x_continuous(limits = c(0, 1)) +
  scale_fill_viridis_c(option = "magma")

# Save the plot
ggsave("proteo_ARG_richness.png", plot = proteo_ARG_richness, width = 5, height = 5)

# Print the plot
print(proteo_ARG_richness)

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$ARG_richness

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties

# Print correlation coefficient and its associated p-value
print(cor_test)

## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 395.41, p-value = 0.2877
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2939087

Contributors to ARG load within the Proteobacteria

# Make a relative abundance matrix for order
relabundance_matrix_Proteobacteria_order <- assay(altExp(tse_proteobacteria, "order"), "relabundance")

# calculate the total relative abundance of each order (row sums)
total_relabundance_Proteobacteria_order <- rowSums(relabundance_matrix_Proteobacteria_order)

# Identify the top 5 Orders within Proteobacteria
top_Proteobacteria_orders <- names(sort(total_relabundance_Proteobacteria_order, decreasing = TRUE)[1:5])

# Print the top 5 Orders within Proteobacteria
print(top_Proteobacteria_orders)

## [1] "o__Burkholderiales"  "o__Rhizobiales"      "o__Aeromonadales"   
## [4] "o__Pseudomonadales"  "o__Enterobacterales"

# Plot relative abundance vs ARG load for the top 5 Orders within Proteobacteria

# Transpose the matrix so that samples are rows and orders are columns
transposed_relabundance <- t(relabundance_matrix_Proteobacteria_order)

# Ensure row names in transposed matrix match colData(tse)
rownames(transposed_relabundance) <- colnames(relabundance_matrix_Proteobacteria_order)

# Keep only the top 5 Proteobacteria orders
top_5_relabund <- transposed_relabundance[, top_Proteobacteria_orders, drop = FALSE]

# Rename columns: Remove "__o"
colnames(top_5_relabund) <- gsub("o__", "", colnames(top_5_relabund))

# Add relabundance of the top 5 orders as new columns in colData(tse)
for (order in colnames(top_5_relabund)) {
  colData(tse)[[paste0(order, "_relabund")]] <- top_5_relabund[, order]
}

# Define the relative abundance columns
relabund_columns <- c("Aeromonadales_relabund", "Burkholderiales_relabund",
                      "Enterobacterales_relabund", "Pseudomonadales_relabund",
                      "Rhizobiales_relabund")

# Create a list to store the plots
plots_list <- list()

# Loop through each column and create a plot
for (col in relabund_columns) {
  
  # Extract the order name by removing "_relabund"
  order_name <- gsub("_relabund", "", col)
  
  # Create the plot
  plot <- ggplot(mapping = aes_string(x = col, y = "ARG_load", color = "colData(tse)$Administrative_region"), 
                 data = colData(tse)) +
    geom_point() +
    labs(x = paste("Rel. abundance of", order_name), 
         y = "ARG load", color = "Administrative region") +
    scale_x_continuous(limits = c(0, 1)) +
    theme_bw() +
    scale_fill_viridis_c(option = "magma") +
    ggtitle(paste("o_", order_name)) +
    theme(legend.position = "none")  # Remove legend from individual plots
  
  # Store the plot in the list
  plots_list[[order_name]] <- plot
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Combine all plots into one row with a shared legend on the right
combined_order_plot <- wrap_plots(plots_list, ncol = 5) + 
  plot_layout(guides = "collect")  # Collect legends and place one legend

ggsave("combined_order_plot.png", combined_order_plot, height = 5, width = 15)

print(combined_order_plot)

Correlation testing between Orders with mean relative abundance > 1% and ARG load

# Calculate mean relative abundance of all orders
mean_relabundance_order <- rowMeans(relabundance_matrix_order)

# Filter mean_relabundance_order to keep only values > 1%
mean_relabundance_order <- mean_relabundance_order[mean_relabundance_order > 1]

# Order alphabetically
mean_relabundance_order <- mean_relabundance_order[order(names(mean_relabundance_order))]

# Print the updated object
print(mean_relabundance_order)

##       o__Aeromonadales          o__Bacillales     o__Burkholderiales 
##               9.712867               2.363786              16.275879 
##     o__Caulobacterales   o__Corynebacteriales       o__Deinococcales 
##               1.222198               3.285481               1.085390 
##    o__Enterobacterales    o__Flavobacteriales       o__Micrococcales 
##               4.785705               4.075827               3.712140 
##   o__Micromonosporales          o__Nostocales o__Propionibacteriales 
##               1.060213               3.708840               1.449689 
##     o__Pseudomonadales         o__Rhizobiales     o__Rhodobacterales 
##               7.112397              10.002062               3.718957 
##       o__Rhodocyclales    o__Rhodospirillales    o__Sphingomonadales 
##               1.373476               1.354292               2.996436 
##    o__Streptomycetales     o__Xanthomonadales 
##               3.224572               2.735867

# Pearson correlation testing

# Prepare a dataframe for correlation testing between Orders and ARG_load

# Extract the order names from mean_relabundance_order
selected_orders <- names(mean_relabundance_order)

# Initialize a list to store results for the selected orders
order_relabund_list <- list()

# Loop through each order in mean_relabundance_order
for (order in selected_orders) {
  # Subset the data for the current order
  tse_order <- tse[rowData(tse)$order %in% c(order) & !is.na(rowData(tse)$order), ]
  
  # Calculate column sums of the "relabundance" assay for the current order
  order_relabund <- colSums(assay(tse_order, "relabundance"))
  
  # Store the result in the list, with the order name as the key
  order_relabund_list[[order]] <- order_relabund
}

# Combine the list into a data frame (orders as rows)
order_relabund_df <- do.call(rbind, order_relabund_list)

# Convert the list to a data frame
order_relabund_df <- as.data.frame(order_relabund_df)

# Transpose the data frame (samples as rows, order mean relative abundances as columns)
order_relabund_df <- t(order_relabund_df)

# Remove "o__" from the start of each column name
colnames(order_relabund_df) <- gsub("^o__", "", colnames(order_relabund_df))

# Access the variables
variable1 <- colData(tse)$ARG_load
variable2 <- as.numeric(order_relabund_df[, "Aeromonadales"])
variable3 <- as.numeric(order_relabund_df[, "Bacillales"])
variable4 <- as.numeric(order_relabund_df[, "Burkholderiales"])
variable5 <- as.numeric(order_relabund_df[, "Caulobacterales"])
variable6 <- as.numeric(order_relabund_df[, "Corynebacteriales"])
variable7 <- as.numeric(order_relabund_df[, "Deinococcales"])
variable8 <- as.numeric(order_relabund_df[, "Enterobacterales"])
variable9 <- as.numeric(order_relabund_df[, "Flavobacteriales"])
variable10 <- as.numeric(order_relabund_df[, "Micrococcales"])
variable11 <- as.numeric(order_relabund_df[, "Micromonosporales"])
variable12 <- as.numeric(order_relabund_df[, "Nostocales"])
variable13 <- as.numeric(order_relabund_df[, "Propionibacteriales"])
variable14 <- as.numeric(order_relabund_df[, "Pseudomonadales"])
variable15 <- as.numeric(order_relabund_df[, "Rhizobiales"])
variable16 <- as.numeric(order_relabund_df[, "Rhodobacterales"])
variable17 <- as.numeric(order_relabund_df[, "Rhodocyclales"])
variable18 <- as.numeric(order_relabund_df[, "Rhodospirillales"])
variable19 <- as.numeric(order_relabund_df[, "Sphingomonadales"])
variable20 <- as.numeric(order_relabund_df[, "Streptomycetales"])
variable21 <- as.numeric(order_relabund_df[, "Xanthomonadales"])

# Calculate Pearson's correlation coefficients and p-values
cor_test_Aeromonadales <- cor.test(variable1, variable2, method = "spearman")
cor_test_Bacillales <- cor.test(variable1, variable3, method = "spearman")
cor_test_Burkholderiales <- cor.test(variable1, variable4, method = "spearman")
cor_test_Caulobacterales <- cor.test(variable1, variable5, method = "spearman")
cor_test_Corynebacteriales <- cor.test(variable1, variable6, method = "spearman")
cor_test_Deinococcales <- cor.test(variable1, variable7, method = "spearman")
cor_test_Enterobacterales <- cor.test(variable1, variable8, method = "spearman")
cor_test_Flavobacteriales <- cor.test(variable1, variable9, method = "spearman")
cor_test_Micrococcales <- cor.test(variable1, variable10, method = "spearman")
cor_test_Micromonosporales <- cor.test(variable1, variable11, method = "spearman")
cor_test_Nostocales <- cor.test(variable1, variable12, method = "spearman")
cor_test_Propionibacteriales <- cor.test(variable1, variable13, method = "spearman")
cor_test_Pseudomonadales <- cor.test(variable1, variable14, method = "spearman")
cor_test_Rhizobiales <- cor.test(variable1, variable15, method = "spearman")
cor_test_Rhodobacterales <- cor.test(variable1, variable16, method = "spearman")
cor_test_Rhodocyclales <- cor.test(variable1, variable17, method = "spearman")
cor_test_Rhodospirillales <- cor.test(variable1, variable18, method = "spearman")
cor_test_Sphingomonadales <- cor.test(variable1, variable19, method = "spearman")
cor_test_Streptomycetales <- cor.test(variable1, variable20, method = "spearman")
cor_test_Xanthomonadales <- cor.test(variable1, variable21, method = "spearman")

# Extract the correlation coefficients and p-values from each test
cor_results <- data.frame(
  Order = c("Aeromonadales", "Bacillales", "Burkholderiales", "Caulobacterales", 
            "Corynebacteriales", "Deinococcales", "Enterobacterales", "Flavobacteriales", 
            "Micrococcales", "Micromonosporales", "Nostocales", "Propionibacteriales", 
            "Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodocyclales", 
            "Rhodospirillales", "Sphingomonadales", "Streptomycetales", "Xanthomonadales"),
  CorrelationCoefficient = c(cor_test_Aeromonadales$estimate, cor_test_Bacillales$estimate, 
                            cor_test_Burkholderiales$estimate, cor_test_Caulobacterales$estimate, 
                            cor_test_Corynebacteriales$estimate, cor_test_Deinococcales$estimate, 
                            cor_test_Enterobacterales$estimate, cor_test_Flavobacteriales$estimate, 
                            cor_test_Micrococcales$estimate, cor_test_Micromonosporales$estimate, 
                            cor_test_Nostocales$estimate, cor_test_Propionibacteriales$estimate, 
                            cor_test_Pseudomonadales$estimate, cor_test_Rhizobiales$estimate, 
                            cor_test_Rhodobacterales$estimate, cor_test_Rhodocyclales$estimate, 
                            cor_test_Rhodospirillales$estimate, cor_test_Sphingomonadales$estimate, 
                            cor_test_Streptomycetales$estimate, cor_test_Xanthomonadales$estimate),
  PValue = c(cor_test_Aeromonadales$p.value, cor_test_Bacillales$p.value, 
             cor_test_Burkholderiales$p.value, cor_test_Caulobacterales$p.value, 
             cor_test_Corynebacteriales$p.value, cor_test_Deinococcales$p.value, 
             cor_test_Enterobacterales$p.value, cor_test_Flavobacteriales$p.value, 
             cor_test_Micrococcales$p.value, cor_test_Micromonosporales$p.value, 
             cor_test_Nostocales$p.value, cor_test_Propionibacteriales$p.value, 
             cor_test_Pseudomonadales$p.value, cor_test_Rhizobiales$p.value, 
             cor_test_Rhodobacterales$p.value, cor_test_Rhodocyclales$p.value, 
             cor_test_Rhodospirillales$p.value, cor_test_Sphingomonadales$p.value, 
             cor_test_Streptomycetales$p.value, cor_test_Xanthomonadales$p.value)
)

# View the resulting data frame with correlation coefficients and p-values
print(cor_results)

##                  Order CorrelationCoefficient      PValue
## 1        Aeromonadales            0.357142857 0.191620173
## 2           Bacillales           -0.303571429 0.270772063
## 3      Burkholderiales            0.085714286 0.762997252
## 4      Caulobacterales            0.242857143 0.381991520
## 5    Corynebacteriales           -0.439285714 0.103199216
## 6        Deinococcales           -0.103571429 0.714423628
## 7     Enterobacterales           -0.110714286 0.695276099
## 8     Flavobacteriales            0.210714286 0.449865622
## 9        Micrococcales           -0.528571429 0.045425102
## 10   Micromonosporales           -0.653571429 0.010031783
## 11          Nostocales           -0.710714286 0.004055074
## 12 Propionibacteriales           -0.664285714 0.008578169
## 13     Pseudomonadales            0.603571429 0.019540831
## 14         Rhizobiales           -0.442857143 0.100186234
## 15     Rhodobacterales           -0.332142857 0.226353138
## 16       Rhodocyclales           -0.075000000 0.792571062
## 17    Rhodospirillales           -0.478571429 0.073466155
## 18    Sphingomonadales           -0.082142857 0.772822127
## 19    Streptomycetales           -0.660714286 0.009043086
## 20     Xanthomonadales           -0.007142857 0.984662517

Exploration of the ARGs

Antibiotic Class

altExp(tse, "read_abclass") <- agglomerateByVariable(
  altExp(tse, "read"),
  rowData(altExp(tse, "read"))$Class,
  by = "rows",
  assay.type = "abundances"
)

## Warning: 'pa' includes binary values.
## Agglomeration of it might lead to meaningless values.
## Check the assay, and consider doing transformation againmanually with agglomerated data.

# Convert to relative abundance
altExp(tse, "read_abclass") <- transformAssay(altExp(tse, "read_abclass"), assay.type = "abundances", method = "relabundance")

# Plot the abundance
ARG_class_plot <- plotAbundance(altExp(tse, "read_abclass"), assay.type = "relabundance", rank = "Class", add_x_text = TRUE)

# Save the plot
ggsave("ARG_class_plot.png", plot = ARG_class_plot, width = 7, height = 4, dpi = 600)

# Print the plot
print(ARG_class_plot)

# Make a relative abundance matrix (RPKM) for ARG class
altExp(tse, "read_abclass") <- transformAssay(altExp(tse, "read_abclass"), assay.type = "abundances", method = "relabundance")
relabundance_matrix_class <- assay(altExp(tse, "read_abclass"), "relabundance")

# Calculate the mean relative abundance of each ARG class
mean_relabundance_class <- (rowSums(relabundance_matrix_class)) / 15

# Order the ARG classes by mean relative abundance
top_classes <- names(sort(mean_relabundance_class, decreasing = TRUE)[1:14])

# Print mean ARG class relative abundance
print(mean_relabundance_class)

##           Aminoglycoside              Beta-lactam Beta-lactam (carbapenem) 
##             0.0277312238             0.5357815461             0.1027188513 
##          Chloramphenicol                 Colistin               Fosfomycin 
##             0.0046715233             0.1438166043             0.0215088775 
##             Fusidic acid              Lincosamide                Macrolide 
##             0.0019493327             0.0051260230             0.0214093242 
##        MDR (efflux pump)                Quinolone              Sulfonamide 
##             0.0115892460             0.0003296112             0.0532232268 
##             Tetracycline             Trimethoprim 
##             0.0511944610             0.0189501488

ARG cluster90 heatmap

# Create an ARG cluster AltExp
altExp(tse, "arg_cluster") <- agglomerateByVariable(altExp(tse, "read"), rowData(altExp(tse, "read"))$Cluster90_description, by = "rows")

## Warning: 'pa' includes binary values.
## Agglomeration of it might lead to meaningless values.
## Check the assay, and consider doing transformation againmanually with agglomerated data.

# Add log abundance assay
tse_arg_cluster <- altExp(tse, "arg_cluster")
tse_arg_cluster <- transformAssay(tse_arg_cluster, assay.type = "abundances",
                                  method = "log", pseudocount = TRUE, name = "log")

## A pseudocount of 0.137471073839662 was applied.

## Warning: non-integer data: divided by smallest positive value

# Get the assay table
mat <- assay(tse_arg_cluster, "log")

# Make a dataset for additional ARG information
arg_data <- as.data.frame(rowData(tse_arg_cluster))
arg_data <- arg_data[, "Class", drop = FALSE]

# Order the ARGs by their class
arg_data_ordered <- arg_data[order(arg_data$Class), , drop = FALSE]

# Reorder the data matrix rows according to the ordered ARGs
mat <- mat[rownames(arg_data_ordered), ]

# Define a color palette that goes from white to red
colors <- colorRampPalette(c("white", "red"))(100)

# Create the heatmap with the custom color palette
ARG_cluster_heatmap <- pheatmap(mat, 
                                annotation_row = arg_data_ordered, 
                                cluster_rows = FALSE, cluster_cols = FALSE, 
                                cellwidth = 15, cellheight = 10,
                                color = colors)
# Save the plot
ggsave("ARG_heatmap.png", plot = ARG_cluster_heatmap, width = 7, height = 4, dpi = 600)

# Print the plot
print(ARG_cluster_heatmap)

IS elements

# Print the number of IS elements in metagenome assemblies
print(colData(tse)$IS_assembled)

##  [1] 12 29  8 11 35 48 97 58 67 13 19 28 25 70 47

# Calculate mean number of IS elements in assemblies
print(mean(colData(tse)$IS_assembled))

## [1] 37.8

# Calculate median  number of IS elements in assemblies
print(median(colData(tse)$IS_assembled))

## [1] 29

# Calculate no. of transposable elements per 1,000,000,000 assembled bases
colData(tse)$IS_per_Mb <- (colData(tse)$IS_assembled / colData(tse)$assembly_length) * 1000000

# Calculate no. of ARGs per 1,000,000,000 assembled bases
colData(tse)$ARGs_per_Mb <- (colData(tse)$ARGs_assembly / colData(tse)$assembly_length) * 1000000

ARGs and IS elements in MAGs

# Load MAG_data
MAG_data <- read.csv("MAG_data.csv")
MAG_AMR <- read.csv("MAG_AMR.csv")
MAG_IS_elements <- read.csv("MAG_IS_elements.csv")

# ARGs per complete genome
MAG_data$ARGs_per_genome <- MAG_data$no_ARGs/(MAG_data$Completeness/100)

# IS elements per complete genome
MAG_data$IS_per_genome <- MAG_data$no_IS/(MAG_data$Completeness/100)

# Reverse the order of Genus variable for A-> Z plotting
MAG_data <- MAG_data %>%
  mutate(Genus = factor(Genus, levels = rev(sort(unique(Genus)))))

# Plot the genera of mid & high quality MAGs
MAG_taxa <- ggplot(mapping = aes(x = Genus, fill = Quality), data = MAG_data) +
  geom_bar() +
  theme_minimal() +
  labs(y = "Count", fill = "MAG quality") +
  scale_x_reverse() +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "High" = "#CCCCCC",  # Light Gray
      "Medium" = "#666666"    # Dark Gray
    )
  ) +
  scale_x_discrete(
    labels = function(x) sapply(x, function(i) bquote(italic(.(i)))) # Italicize genus names
  )

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

# Save the plot
ggsave("MAG_taxa.png", plot = MAG_taxa, width = 4.5, height = 4, dpi = 300)

# Print the plot
print(MAG_taxa)

# Reverse the order of AMR determinant variable for A -> Z plotting
MAG_AMR <- MAG_AMR %>%
  mutate(AMR_determinant = factor(AMR_determinant, levels = rev(sort(unique(AMR_determinant)))))

# Plot the AMR determinants in mid & high quality MAGs
MAG_AMR_plot <- ggplot(mapping = aes(x = AMR_determinant, fill = Genus), data = MAG_AMR) +
  geom_bar() +
  theme_minimal() +
  labs(x = "AMR determinant", y = "Count") +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "Aeromonas" = "#88CCEE",
      "Shewanella" = "#44AA99",
      "Stenotrophomonas" = "#AA4499"
    ),
    labels = c(
      "Aeromonas" = expression(italic("Aeromonas")),
      "Shewanella" = expression(italic("Shewanella")),
      "Stenotrophomonas" = expression(italic("Stenotrophomonas"))
    )
  )


# Save the plot
ggsave("MAG_AMR_plot.png", plot = MAG_AMR_plot, width = 4.5, height = 4, dpi = 300)

# Print the plot
print(MAG_AMR_plot)

# Reverse the order of IS families variable for A-> Z plotting
MAG_IS_elements <- MAG_IS_elements %>%
  mutate(IS_family = factor(IS_family, levels = rev(sort(unique(IS_family)))))

# Plot the IS families in mid & high quality MAGs
MAG_IS_plot <- ggplot(mapping = aes(x = IS_family, fill = Genus), data = MAG_IS_elements) +
  geom_bar() +
  theme_minimal() +
  labs(x = "IS family", y = "Count") +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "Aeromonas" = "#88CCEE",         # Muted Blue
      "Aliarcobacter" = "#117733",    # Deep Green
      "Limnohabitans" = "#DDCC77",    # Muted Yellow
      "Massilia" = "#332288",         # Deep Blue
      "Serratia" = "#CC6677",         # Muted Coral
      "Shewanella" = "#44AA99",       # Muted Teal
      "Stenotrophomonas" = "#AA4499", # Muted Purple
      "Trichormus" = "#999933"        # Muted Olive
    ),
    labels = c(
      "Aeromonas" = expression(italic("Aeromonas")),
      "Aliarcobacter" = expression(italic("Aliarcobacter")),
      "Limnohabitans" = expression(italic("Limnohabitans")),
      "Massilia" = expression(italic("Massilia")),
      "Serratia" = expression(italic("Serratia")),
      "Shewanella" = expression(italic("Shewanella")),
      "Stenotrophomonas" = expression(italic("Stenotrophomonas")),
      "Trichormus" = expression(italic("Trichormus"))
    )
  )

# Save the plot
ggsave("MAG_IS_plot.png", plot = MAG_IS_plot, width = 4.5, height = 4, dpi = 300)

# Print the plot
print(MAG_IS_plot)

Gammaproteobacteria vs other classes

# Gammaproteobacteria (the class within Proteobacteria from which all binned ARGs were identified) vs other bacteria
MAG_data$Gammaproteo <- ifelse(MAG_data$Class == "Gammaproteobacteria", "Yes", "No")

# ARGs per group
gammaproteo_ARGs <- MAG_data$ARGs_per_genome[MAG_data$Gammaproteo == "Yes"]
others_ARGs <- MAG_data$ARGs_per_genome[MAG_data$Gammaproteo == "No"]

# IS elements per group
gammaproteo_IS <- MAG_data$IS_per_genome[MAG_data$Gammaproteo == "Yes"]
others_IS <- MAG_data$IS_per_genome[MAG_data$Gammaproteo == "No"]

ARGs

# Print mean ARGs per complete Gammaproteobacteria genome
print(mean(gammaproteo_ARGs))

## [1] 0.6155669

# Print mean ARGs per complete 'other' genome
print(mean(others_ARGs))

## [1] 0

# Test for statistical difference (non-parametric) using Wilcoxon rank-sum test
wilcox.test(gammaproteo_ARGs, others_ARGs)

## Warning in wilcox.test.default(gammaproteo_ARGs, others_ARGs): cannot compute
## exact p-value with ties

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  gammaproteo_ARGs and others_ARGs
## W = 132, p-value = 0.03913
## alternative hypothesis: true location shift is not equal to 0

IS elements

# Print mean ARGs per complete Gammaproteobacteria genome
print(mean(gammaproteo_IS))

## [1] 1.031057

# Print mean ARGs per complete 'other' genome
print(mean(others_IS))

## [1] 0.568052

# Test for statistical difference (non-parametric) using Wilcoxon rank-sum test
wilcox.test(gammaproteo_IS, others_IS)

## Warning in wilcox.test.default(gammaproteo_IS, others_IS): cannot compute exact
## p-value with ties

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  gammaproteo_IS and others_IS
## W = 129.5, p-value = 0.1223
## alternative hypothesis: true location shift is not equal to 0

The effects of sequencing depth on ARG and MAG identification

MAG recovery

# Plot sequencing depth vs MAGs
depth_vs_MAGs_plot <- ggplot(mapping = aes(total_reads, mid_high_MAG), data = colData(tse)) +
  geom_point() +
  labs(x = "Number of reads", y = "Number of MAGs recovered") +
  scale_x_continuous(labels = scales::comma, limits = c(0, 40000000)) +
  scale_y_continuous(limits = c(0, 5)) +
  theme_bw()

# Save the plot
ggsave("depth_vs_MAGs_plot.png", plot = depth_vs_MAGs_plot, width = 7, height = 5)

# Print the plot
print(depth_vs_MAGs_plot)

# Access variables from colData
variable1 <- colData(tse)$total_reads
variable2 <- colData(tse)$mid_high_MAG

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties

# Print correlation coefficient and its associated p-value
print(cor_test)

## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 443.29, p-value = 0.4561
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2084034

ARG richness

# Plot sequencing depth vs MAGs
depth_vs_ARGs_plot <- ggplot(mapping = aes(total_reads, ARG_richness), data = colData(tse)) +
  geom_point() +
  labs(x = "Number of reads", y = "Number of unique ARGs Detected") +
  scale_x_continuous(labels = scales::comma, limits = c(0, 40000000)) +
  scale_y_continuous(limits = c(0, 16)) +
  theme_bw()

# Save the plot
ggsave("depth_vs_ARGs_plot.png", plot = depth_vs_ARGs_plot, width = 7, height = 5)

# Print the plot
print(depth_vs_ARGs_plot)

# Access variables from colData
variable1 <- colData(tse)$total_reads
variable2 <- colData(tse)$ARG_richness

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties

# Print correlation coefficient and its associated p-value
print(cor_test)

## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 401.43, p-value = 0.3065
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2831559

Rarefaction curves

# Import rarefaction data for samples with more than 20M reads following trimming and filtering
rarefaction_data <- read.csv("rarefaction_data.csv")

# Ensure Sample is a factor
rarefaction_data$Sample <- as.factor(rarefaction_data$Sample)

# Plot rarefaction curves
rarefaction_ARG_plot <- ggplot(rarefaction_data, aes(x = Depth, y = ARGs_detected, color = Sample, linetype = Sample)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  labs(
    x = "Reads post-QC",
    y = "Number of unique ARGs Detected"
  ) +
  scale_linetype_manual(values = c("dotted", "twodash", "dashed", "dotdash", "twodash")) +
  scale_x_continuous(labels = scales::comma, limits = c(0, 30000000)) +
  scale_y_continuous(limits = c(0, 15)) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Save the plot
ggsave("rarefaction_ARG_plot.png", plot = rarefaction_ARG_plot, width = 7, height = 5)

# Print the plot
print(rarefaction_ARG_plot)

Snail_faecal_metagenomics

Angus O’Ferrall

2025-07-04

Taxonomy and recovered metagenome assembled genomes (MAGs)

Phyla

Calculate Proteobacteria mean relative abundance

Orders

Calculate Enterobacterales mean relative abundance

Calculate Enterobacterales mean relative abundance with outlier (Uk1) removed

Exploring relative abundance of Shewanella (genus level)

Exploring relative abundance of Aeromonas (genus level)

Hierarchical clustering of samples and taxa

Diversity

Alpha diversity

Proteobacteria vs alpha-diversity

Beta diversity

ARGs

ARG load

ARG richness (number of ARG references that reads mapped to per sample)

Assembly-based ARG detection

How many MAGs (and MAGs assigned to Proteobacteria) contain ARGs?

Proteobacteria vs ARG load

Proteobacteria vs ARG richness

Contributors to ARG load within the Proteobacteria

Correlation testing between Orders with mean relative abundance > 1% and ARG load

Exploration of the ARGs

Antibiotic Class

ARG cluster90 heatmap

IS elements

ARGs and IS elements in MAGs

Gammaproteobacteria vs other classes

ARGs

IS elements

The effects of sequencing depth on ARG and MAG identification

MAG recovery

ARG richness

Rarefaction curves