# Import the microbiome study data object (Tree Summarized Experiment)
tse <- readRDS("O'Ferrall_snails_2024.rds")
## Attach packages
library(knitr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(mia)
## Loading required package: MultiAssayExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: SingleCellExperiment
## Loading required package: TreeSummarizedExperiment
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## This is mia version 1.14.0
## - Online documentation and vignettes: https://microbiome.github.io/mia/
## - Online book 'Orchestrating Microbiome Analysis (OMA)': https://microbiome.github.io/OMA/docs/devel/
## 
## Attaching package: 'mia'
## The following objects are masked from 'package:dplyr':
## 
##     full_join, inner_join, left_join, right_join
library(miaViz)
## Loading required package: ggraph
## 
## Attaching package: 'miaViz'
## The following object is masked from 'package:mia':
## 
##     plotNMDS
library(pheatmap)
library(ggrepel)
library(patchwork)
library(scater)
## Loading required package: scuttle

Taxonomy and recovered metagenome assembled genomes (MAGs)

# Calculate the number of unique taxa for each taxonomic rank
apply(rowData(tse), 2, function (x) {length(unique(x))})
##     domain     phylum      class      order     family      genus    species 
##          1         39         73        160        352       1246       4773 
## clade_name 
##       4773
# Print total number of high-quality MAGs
print(sum(colData(tse)$high_qual_MAG))
## [1] 8
# Print total number of medium-quality MAGs
print(sum(colData(tse)$mid_qual_MAG))
## [1] 21

Phyla

# Make a relative abundance matrix for phyla
relabundance_matrix_phylum <- assay(altExp(tse, "phylum"), "relabundance")

# Calculate the total relative abundance of each phylum (row sums)
total_relabundance_phylum <- rowSums(relabundance_matrix_phylum)

# Identify the top 7 phyla (NB. these have mean relative abundance > 1%)
top_phyla <- names(sort(total_relabundance_phylum, decreasing = TRUE)[1:7])

# Make a new TSE where the phyla with relative abundance > 1% are recognized, while others are "-Other"
tse_top_7_phyla <- altExp(tse, "phylum")
rowData(tse_top_7_phyla)$phylum <- ifelse(rowData(tse_top_7_phyla)$phylum %in% top_phyla, rowData(tse_top_7_phyla)$phylum, "-Other")
rowData(tse_top_7_phyla)$Rank <- gsub("^p__", "", rowData(tse_top_7_phyla)$phylum)

# Define custom colors
custom_phyla_colors <- c(
  "Actinobacteria" = "#F8766D",
  "Bacteroidetes" = "#7CAE00",
  "Cyanobacteria" = "#00BFC4",
  "Deinococcus-Thermus" = "#C77CFF",
  "Firmicutes" = "#A0522D",
  "Planctomycetes" = "#FF69B4",
  "Proteobacteria" = "#1F4E79",
  "-Other" = "#BEBEBE"
)

# Plot the abundance
phyla_plot <- plotAbundance(tse_top_7_phyla, assay.type = "relabundance", rank = "Rank", add_x_text = TRUE) +
  scale_fill_manual(values = custom_phyla_colors, name = "Phylum") +
  scale_colour_manual(values = custom_phyla_colors, name = "Phylum") +
  guides(fill = guide_legend(override.aes = list(color = NA))) +
  theme(plot.margin = margin(t = 30, r = 10, b = 10, l = 10))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Save the plot
ggsave("phyla_plot.png", plot = phyla_plot, width = 7, height = 5, dpi = 600)

# Print the plot
print(phyla_plot)

Calculate Proteobacteria mean relative abundance

# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_phylum <- relabundance_matrix_phylum * 100

# Extract Proteobacteria
proteobacteria_abundance <- relabundance_matrix_phylum["p__Proteobacteria", , drop = FALSE]

# Calculate mean Proteobacteria relative abundance
mean_proteobacteria_relabund <- sum(proteobacteria_abundance) / 15
print(mean_proteobacteria_relabund)
## [1] 66.27109

Orders

# Make a relative abundance matrix for order
relabundance_matrix_order <- assay(altExp(tse, "order"), "relabundance")

# Calculate the total relative abundance of each order (row sums)
total_relabundance_order <- rowSums(relabundance_matrix_order)

# Identify the top 20 orders (NB. these have mean relative abundance > 1%)
top_orders <- names(sort(total_relabundance_order, decreasing = TRUE)[1:20])

# Make a new TSE where the orders with relative abundance > 1% are recognized, while others are "-Other"
tse_top_20_orders <- altExp(tse, "order")
rowData(tse_top_20_orders)$order <- ifelse(rowData(tse_top_20_orders)$order %in% top_orders, rowData(tse_top_20_orders)$order, "-Other")
rowData(tse_top_20_orders)$Rank <- gsub("^o__", "", rowData(tse_top_20_orders)$order)

custom_order_colors <- c(
  "-Other" = "#BEBEBE",
  "Flavobacteriales" = "yellow",
  "Deinococcales" = "#377EB8",
  "Nostocales" = "#4DAF4A",
  "Bacillales" = "#984EA3",
  "Micromonosporales" = "#FF7F00",
  "Propionibacteriales" = "grey20",
  "Streptomycetales" = "#A65628",
  "Corynebacteriales" = "#00441B",
  "Micrococcales" = "blue",
  "Xanthomonadales" = "#66C2A5",
  "Enterobacterales" = "#FC8D62",
  "Pseudomonadales" = "#8DA0CB",
  "Aeromonadales" = "#D73027",
  "Caulobacterales" = "#A6D854",
  "Rhodospirillales" = "#E6AB02",
  "Sphingomonadales" = "#E5C494",
  "Rhodobacterales" = "#54278F",
  "Rhizobiales" = "#D95F02",
  "Rhodocyclales" = "#7570B3",
  "Burkholderiales" = "#1B9E77"
)

# Plot the abundance
order_plot <- plotAbundance(tse_top_20_orders, assay.type = "relabundance", rank = "Rank", add_x_text = TRUE) +
  scale_fill_manual(values = custom_order_colors, name = "Order") +
  scale_colour_manual(values = custom_order_colors, name = "Order") +
  guides(fill = guide_legend(override.aes = list(color = NA)))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Save the plot
ggsave("order_plot.png", plot = order_plot, width = 8.2, height = 5, dpi = 600)

# Print the plot
print(order_plot)

Calculate Enterobacterales mean relative abundance

# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_order <- relabundance_matrix_order * 100

# Extract Enterobacterales
enterobacterales_abundance <- relabundance_matrix_order["o__Enterobacterales", , drop = FALSE]

# Calculate mean Enterobacterales relative abundance
mean_enterobacterales_relabund <- sum(enterobacterales_abundance) / 15
print(mean_enterobacterales_relabund)
## [1] 4.785705

Calculate Enterobacterales mean relative abundance with outlier (Uk1) removed

# Remove Uk1 from the Enterobacterales relative abundance dataframe
enterobacterales_abundance_minus_Uk1 <- as.data.frame(enterobacterales_abundance)
enterobacterales_abundance_minus_Uk1$Uk1 <- NULL

# Calculate mean Enterobacterales relative abundance without Uk1
mean_enterobacterales_relabund_minus_Uk1 <- sum(enterobacterales_abundance_minus_Uk1) / 14
print(mean_enterobacterales_relabund_minus_Uk1)
## [1] 2.14886

Exploring relative abundance of Shewanella (genus level)

# Make a relative abundance matrix for genus
relabundance_matrix_genus <- assay(altExp(tse, "genus"), "relabundance")

# Multiply relative abundances by 100 to express as percentages
relabundance_matrix_genus <- relabundance_matrix_genus * 100

# Extract Shewanella
shewanella_abundance <- relabundance_matrix_genus["g__Shewanella", , drop = FALSE]

# Print Shewanella relabundance
print(shewanella_abundance)
##                       M1         M2         M3        M4       M5         M6
## g__Shewanella 0.07431693 0.07474352 0.09481099 0.2391963 5.065212 0.06392155
##                      Ug1       Ug2        Ug3         Z1         Z2        Z3
## g__Shewanella 0.09437259 0.2747101 0.07968018 0.05922727 0.08122704 0.1393015
##                       Z4        Uk1      Uk2
## g__Shewanella 0.08427348 0.09382548 1.328371
# Calculate mean Shewanella relative abundance
mean_shewanella_relabundance <- sum(shewanella_abundance) / 15
print(mean_shewanella_relabundance)
## [1] 0.523146
# Remove M5 and Uk2 from the Shewanella relative abundance dataframe
shewanella_abundance_minus_M5_Uk2 <- as.data.frame(shewanella_abundance)
shewanella_abundance_minus_M5_Uk2$M5 <- NULL
shewanella_abundance_minus_M5_Uk2$Uk2 <- NULL

# Calculate mean Shewanella relative abundance without M5 and Uk2 from which OXA-48-like genes were later identified
mean_shewanella_relabundance_minus_M5_Uk2 <- sum(shewanella_abundance_minus_M5_Uk2) / 13
print(mean_shewanella_relabundance_minus_M5_Uk2)
## [1] 0.1118159

Exploring relative abundance of Aeromonas (genus level)

# Extract Aeromonas
aeromonas_abundance <- relabundance_matrix_genus["g__Aeromonas", , drop = FALSE]

# Print Aeromonas relabundance
print(aeromonas_abundance)
##                    M1       M2       M3       M4       M5       M6      Ug1
## g__Aeromonas 6.476615 1.611563 6.747668 12.41048 53.81697 1.920293 3.322136
##                   Ug2      Ug3       Z1       Z2       Z3       Z4      Uk1
## g__Aeromonas 6.234634 29.57037 4.387261 3.711667 5.622518 4.802656 1.444432
##                 Uk2
## g__Aeromonas 4.5081
# Calculate mean Aeromonas relative abundance
mean_aeromonas_relabundance <- sum(aeromonas_abundance) / 15
print(mean_aeromonas_relabundance)
## [1] 9.772491

Hierarchical clustering of samples and taxa

# Obtain read count data for the top 30 Orders
tse_order <- altExp(tse, "order")

# Add clr-transformation
tse_order <- transformAssay(tse_order, assay.type = "counts", method = "relabundance")
tse_order <- transformAssay(tse_order, assay.type = "relabundance",
                              method = "clr", pseudocount = TRUE)
## A pseudocount of 4.57444733324424e-07 was applied.
# Apply a z-transformation across rows (taxa)
tse_order <- transformAssay(tse_order, assay.type = "clr", 
                              MARGIN = "features",
                              method = "standardize", name = "clr_z")

# Get 50 most abundant orders, and subsets the data by them
top_50_orders <- names(sort(total_relabundance_order, decreasing = TRUE)[1:50])
tse_order_subset <- tse_order[top_50_orders, ]

# Calculate the proportion of bacteria accounted for in the top 50 Orders
(sum(assay(tse_order_subset, "relabundance"))) / 15
## [1] 0.9792133
# Gets the assay table
mat <- assay(tse_order_subset, "clr_z")

# Make a dataframe for sample information
sample_data <- as.data.frame(colData(tse_order_subset))
sample_data <- sample_data[, c("Continent", "Administrative_region", "snail_genus"), drop = FALSE]
names(sample_data)[names(sample_data) == "snail_genus"] <- "Snail genus"
names(sample_data)[names(sample_data) == "Administrative_region"] <- "Administrative region"

# Make a dataframe for taxonomic information
taxonomic_data <- as.data.frame(rowData(tse_order_subset))
taxonomic_data <- taxonomic_data[, "phylum", drop = FALSE]
colnames(taxonomic_data)[colnames(taxonomic_data) == "phylum"] <- "Phylum"
taxonomic_data$Phylum <- sub("^p__", "", taxonomic_data$Phylum)

# Set heatmap scaling and colors
breaks <- seq(-ceiling(max(abs(mat))), ceiling(max(abs(mat))), 
              length.out = ifelse( max(abs(mat))>5, 2*ceiling(max(abs(mat))), 10 ) )
colors <- colorRampPalette(c("darkblue", "blue", "white", "red", "darkred"))(length(breaks)-1)

# Set column annotation colors
ann_colors <- list(
  "Administrative region" = c(
    "Malawi" = "forestgreen",
    "Uganda" = "yellowgreen",
    "United Kingdom" = "greenyellow",
    "Zanzibar" = "yellow2"
  ),
  Phylum = c(
    "Actinobacteria" = "#1B9E77",
    "Bacteroidetes" = "#66C2A5",
    "Cyanobacteria" = "#3288BD",
    "Deinococcus-Thermus" = "#5E4FA2",
    "Firmicutes" = "#ABDDA4",
    "Planctomycetes" = "#A6CEE3",
    "Proteobacteria" = "#8DA0CB",
    "Tenericutes" = "#E0F3F8",
    "Verrucomicrobia" = "#00441B"
  ),
  "Snail genus" = c(
    "Anisus" = "darkorange2",
    "Biomphalaria" = "orange",
    "Bulinus" = "red",
    "Cleopatra" = "red3",
    "Lanistes" = "tan4",
    "Lymnaea" = "bisque3",
    "Physella" = "bisque",
    "Pila" = "goldenrod3"
  ),
  Continent = c(
    "Africa" = "grey90",
    "Europe" = "grey35"
  )
)

# Create the heatmap
order_clusters <- pheatmap(mat, annotation_col = sample_data, annotation_row = taxonomic_data, annotation_colors = ann_colors, color = colors, breaks = breaks, legend_width = 0.5)

# Save the plot
ggsave("order_clusters.png", plot = order_clusters, width = 10, height = 8, dpi = 600)

# Print plot
print(order_clusters)

Diversity

Alpha diversity

# Calculate diversity statistics
tse <- addAlpha(tse, index = "shannon")
tse <- addAlpha(tse, index = "gini_simpson")
# Print Shannon's diversity
print(colData(tse)$shannon)
##  [1] 7.039099 6.962225 7.282968 6.887802 3.881060 5.815881 6.333669 6.445851
##  [9] 6.051164 6.954845 7.372464 7.245500 7.300361 4.835209 6.701276
# Print mean Shannon's diversity
print(mean(colData(tse)$shannon))
## [1] 6.473958
# Print Shannon's diversity median
print(median(colData(tse)$shannon))
## [1] 6.887802
# Print Simpson's diversity
print(colData(tse)$gini_simpson)
##  [1] 0.9964133 0.9951006 0.9969621 0.9914432 0.7925213 0.9738414 0.9864923
##  [8] 0.9911865 0.9715998 0.9939893 0.9986682 0.9978338 0.9972464 0.8985658
## [15] 0.9941253
# Print mean Simpson's diversity
print(mean(colData(tse)$gini_simpson))
## [1] 0.9717326
# Print Simpson's diversity median
print(median(colData(tse)$gini_simpson))
## [1] 0.9939893

Proteobacteria vs alpha-diversity

# Add Proteobacteria relative abundance to colData
tse_proteobacteria <- tse[rowData(tse)$phylum %in% c("p__Proteobacteria") & !is.na(rowData(tse)$phylum), ]
proteobacteria_relabund <- colSums(assay(tse_proteobacteria, "relabundance"))
colData(tse)$proteobacteria_relabund <- proteobacteria_relabund
# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$shannon

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

# Print correlation coefficient and its associated p-value
print(cor_test)
## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 916, p-value = 0.01287
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6357143

Beta diversity

# Generate beta-diversity statistics (Bray-Curtis distance)
tse <- addDissimilarity(tse, assay.type = "relabundance" , method = "bray")
bray_data <- metadata(tse)[["bray"]]

# Print values
print(bray_data)
##            M1        M2        M3        M4        M5        M6       Ug1
## M1  0.0000000 0.3950959 0.2578856 0.3099759 0.7660522 0.5444646 0.4956978
## M2  0.3950959 0.0000000 0.4055159 0.4547122 0.7835672 0.5872936 0.4744968
## M3  0.2578856 0.4055159 0.0000000 0.2293442 0.7544281 0.5129921 0.5095420
## M4  0.3099759 0.4547122 0.2293442 0.0000000 0.7060798 0.5080983 0.5440057
## M5  0.7660522 0.7835672 0.7544281 0.7060798 0.0000000 0.8186170 0.7946426
## M6  0.5444646 0.5872936 0.5129921 0.5080983 0.8186170 0.0000000 0.6178323
## Ug1 0.4956978 0.4744968 0.5095420 0.5440057 0.7946426 0.6178323 0.0000000
## Ug2 0.3837108 0.4853790 0.4557216 0.4802522 0.7355626 0.5711390 0.4613960
## Ug3 0.3780083 0.4766061 0.3650904 0.3433664 0.6036607 0.5409692 0.5308463
## Z1  0.4998875 0.5227628 0.4218702 0.4748654 0.7826303 0.6387735 0.5707443
## Z2  0.4271759 0.4772067 0.3131357 0.3968051 0.8023832 0.6310114 0.5866067
## Z3  0.3362135 0.4131174 0.2451802 0.3423994 0.7357711 0.5558053 0.5162808
## Z4  0.3727922 0.3795538 0.2517945 0.3499632 0.7418444 0.5907308 0.5478100
## Uk1 0.7194104 0.7556882 0.7674740 0.7746296 0.8438423 0.7702989 0.7174539
## Uk2 0.4059672 0.4191871 0.4452232 0.4895973 0.7560532 0.5714587 0.5117115
##           Ug2       Ug3        Z1        Z2        Z3        Z4       Uk1
## M1  0.3837108 0.3780083 0.4998875 0.4271759 0.3362135 0.3727922 0.7194104
## M2  0.4853790 0.4766061 0.5227628 0.4772067 0.4131174 0.3795538 0.7556882
## M3  0.4557216 0.3650904 0.4218702 0.3131357 0.2451802 0.2517945 0.7674740
## M4  0.4802522 0.3433664 0.4748654 0.3968051 0.3423994 0.3499632 0.7746296
## M5  0.7355626 0.6036607 0.7826303 0.8023832 0.7357711 0.7418444 0.8438423
## M6  0.5711390 0.5409692 0.6387735 0.6310114 0.5558053 0.5907308 0.7702989
## Ug1 0.4613960 0.5308463 0.5707443 0.5866067 0.5162808 0.5478100 0.7174539
## Ug2 0.0000000 0.4380097 0.5745889 0.5747490 0.4334179 0.5110393 0.7113661
## Ug3 0.4380097 0.0000000 0.5670455 0.5193736 0.4169732 0.4651173 0.7617499
## Z1  0.5745889 0.5670455 0.0000000 0.3211763 0.3837895 0.4210765 0.8087910
## Z2  0.5747490 0.5193736 0.3211763 0.0000000 0.2714854 0.2517693 0.8068478
## Z3  0.4334179 0.4169732 0.3837895 0.2714854 0.0000000 0.2320110 0.7428368
## Z4  0.5110393 0.4651173 0.4210765 0.2517693 0.2320110 0.0000000 0.7765925
## Uk1 0.7113661 0.7617499 0.8087910 0.8068478 0.7428368 0.7765925 0.0000000
## Uk2 0.4193660 0.4487173 0.5983182 0.5381829 0.4245999 0.4586818 0.7089693
##           Uk2
## M1  0.4059672
## M2  0.4191871
## M3  0.4452232
## M4  0.4895973
## M5  0.7560532
## M6  0.5714587
## Ug1 0.5117115
## Ug2 0.4193660
## Ug3 0.4487173
## Z1  0.5983182
## Z2  0.5381829
## Z3  0.4245999
## Z4  0.4586818
## Uk1 0.7089693
## Uk2 0.0000000
# Define custom colors for Snail genus
snail_genus_colors <- list(
 "Snail genus" = c(
    "Anisus" = "darkorange2",
    "Biomphalaria" = "orange",
    "Bulinus" = "red",
    "Cleopatra" = "red3",
    "Lanistes" = "tan4",
    "Lymnaea" = "bisque3",
    "Physella" = "bisque",
    "Pila" = "goldenrod3"
  )
)

# Make a dataset for additional ARG information
annotation <- as.data.frame(colData(tse))
annotation <- annotation[, "snail_genus", drop = FALSE]
names(annotation)[names(annotation) == "snail_genus"] <- "Snail genus"

# Heatmap
bray_curtis_plot <- pheatmap(bray_data, cluster_rows = FALSE, cluster_cols = FALSE,
  annotation_col = annotation, annotation_colors = snail_genus_colors,
  main = "Bray-Curtis distance"
)

# Save the plot
ggsave("bray_curtis_plot.png", bray_curtis_plot, height = 5, width = 8)

# Print the plot
print(bray_curtis_plot)

# Perform multidimensional scaling (MDS) using Bray-Curtis dissimilarity on relative abundance data
tse <- runMDS(tse, FUN = getDissimilarity, method = "bray", assay.type = "relabundance", name = "MDS_bray")

# Extract coordinates
mds_coords <- reducedDims(tse)$MDS_bray

# Create data frame for plotting
pcoa_df <- data.frame(
  Axis1 = mds_coords[,1],
  Axis2 = mds_coords[,2],
  SampleName = rownames(mds_coords)
)

# Add administrative region to the data frame for plotting
pcoa_df$Administrative_region <- colData(tse)$Administrative_region

# Plot PCoA with sample names
pcoa_plot <- ggplot(pcoa_df, aes(x = Axis1, y = Axis2, color = Administrative_region, label = SampleName)) +
  geom_point(size = 2) +
  geom_text_repel(size = 2.5, max.overlaps = 20) +
  theme_minimal() +
  labs(
    x = "PCoA Axis 1",
    y = "PCoA Axis 2",
    color = "Administrative region" 
  )

# Save the plot
ggsave("pcoa_plot.png", pcoa_plot, width = 7, height = 5)

# Print the plot
print(pcoa_plot)

ARGs

ARG load

# Calculate ARG load
colData(tse)$ARG_load <- colSums(assay(altExp(tse, "read"), "abundances"))
# Print ARG load
print(colData(tse)$ARG_load)
##         M1         M2         M3         M4         M5         M6        Ug1 
##  54.029774  59.618260  37.058997  65.323354 357.039501  10.657301  75.585206 
##        Ug2        Ug3         Z1         Z2         Z3         Z4        Uk1 
##  95.780916 329.591742  47.627164   3.161198  27.483552  16.687200  80.439274 
##        Uk2 
##  59.692572
# Calculate mean ARG load
print(mean(colData(tse)$ARG_load))
## [1] 87.98507
# Calculate median ARG load
print(median(colData(tse)$ARG_load))
## [1] 59.61826

ARG richness (number of ARG references that reads mapped to per sample)

# Calculate ARG richness
colData(tse)$ARG_richness <- colSums(assay(altExp(tse, "read"), "pa"))

# Print ARG richness
print(colData(tse)$ARG_richness)
##  M1  M2  M3  M4  M5  M6 Ug1 Ug2 Ug3  Z1  Z2  Z3  Z4 Uk1 Uk2 
##   4  10   3   3  11   2   4  16  12  13   1  10   6   6   5
# Calculate mean ARG richness
print(mean(colData(tse)$ARG_richness))
## [1] 7.066667
# Calculate median ARG richness
print(median(colData(tse)$ARG_richness))
## [1] 6

Assembly-based ARG detection

# Print ARG detected by assembly-based methods
print(colData(tse)$ARGs_assembly)
##  [1] 2 3 0 1 2 1 5 6 5 2 0 1 2 6 2
# Print sum of ARGs detected by assembly-based methods
print(sum((colData(tse)$ARGs_assembly)))
## [1] 38
# Calculate mean: ARGs in assembly-based detection
print(mean(colData(tse)$ARGs_assembly))
## [1] 2.533333
# Calculate variance: ARGs in assembly-based detection
print(var(colData(tse)$ARGs_assembly))
## [1] 4.12381
# Calculate median: ARGs in assembly-based detection
print(median(colData(tse)$ARGs_assembly))
## [1] 2
# Plot assembly vs read ARG detection
read_vs_assembly <- ggplot(mapping = aes(ARGs_assembly, ARG_richness), data = colData(tse)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(x = "ARGs detected by assembly", y = "ARGs detected by read mapping") +
  theme_bw() +
  scale_x_continuous(limits = c(-1, 17), expand = c(0, 0)) +
  scale_y_continuous(limits = c(-1, 17), expand = c(0, 0)) +
  coord_fixed()

# Save the plot
ggsave("read_vs_assembly.png", plot = read_vs_assembly)
## Saving 7 x 5 in image
# Print the plot
print(read_vs_assembly)

How many MAGs (and MAGs assigned to Proteobacteria) contain ARGs?

# Print total number of ARGs in a MAG
print(sum((colData(tse)$ARGs_MAG)))
## [1] 10
# Print total number of ARGs in a Proteobacteria MAG
print(sum((colData(tse)$ARGs_Proteobacteria_MAG)))
## [1] 10

Proteobacteria vs ARG load

# Plot annotated Proteobacteria relative abundance ARG load on log scale to improve visualisation
proteo_ARG_load_annotated <- ggplot(mapping = aes(proteobacteria_relabund, ARG_load, color = colData(tse)$"Administrative_region"), data = colData(tse)) +
  geom_point() +
  geom_text_repel(aes(label = colData(tse)$snail_genus), fontface = "italic", color = "black") +
  labs(x = "Rel. abundance of Proteobacteria", y = "ARG load",
       color = "Administrative region") +
  theme_bw() +
  scale_x_continuous(limits = c(0, 1)) +
  scale_fill_viridis_c(option = "magma")

# Save the plot
ggsave("proteo_ARG_load_annotated.png", plot = proteo_ARG_load_annotated, width = 7, height = 5)

# Print the plot
print(proteo_ARG_load_annotated)

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$ARG_load

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")

# Print correlation coefficient and its associated p-value
print(cor_test)
## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 86, p-value = 6.005e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8464286

Proteobacteria vs ARG richness

# Plot Proteobacteria relative abundance vs ARG richness
proteo_ARG_richness <- ggplot(mapping = aes(proteobacteria_relabund, ARG_richness, color = colData(tse)$"Administrative_region"), data = colData(tse)) +
  geom_point() +
  geom_text_repel(aes(label = colData(tse)$snail_genus), fontface = "italic", color = "black") +
  labs(x = "Rel. abundance of Proteobacteria", y = "ARG richness",
       color = "Administrative region") +
  theme_bw() +
  scale_x_continuous(limits = c(0, 1)) +
  scale_fill_viridis_c(option = "magma")

# Save the plot
ggsave("proteo_ARG_richness.png", plot = proteo_ARG_richness, width = 5, height = 5)

# Print the plot
print(proteo_ARG_richness)

# Access variables from colData
variable1 <- colData(tse)$proteobacteria_relabund
variable2 <- colData(tse)$ARG_richness

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties
# Print correlation coefficient and its associated p-value
print(cor_test)
## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 395.41, p-value = 0.2877
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2939087

Contributors to ARG load within the Proteobacteria

# Make a relative abundance matrix for order
relabundance_matrix_Proteobacteria_order <- assay(altExp(tse_proteobacteria, "order"), "relabundance")

# calculate the total relative abundance of each order (row sums)
total_relabundance_Proteobacteria_order <- rowSums(relabundance_matrix_Proteobacteria_order)

# Identify the top 5 Orders within Proteobacteria
top_Proteobacteria_orders <- names(sort(total_relabundance_Proteobacteria_order, decreasing = TRUE)[1:5])

# Print the top 5 Orders within Proteobacteria
print(top_Proteobacteria_orders)
## [1] "o__Burkholderiales"  "o__Rhizobiales"      "o__Aeromonadales"   
## [4] "o__Pseudomonadales"  "o__Enterobacterales"
# Plot relative abundance vs ARG load for the top 5 Orders within Proteobacteria

# Transpose the matrix so that samples are rows and orders are columns
transposed_relabundance <- t(relabundance_matrix_Proteobacteria_order)

# Ensure row names in transposed matrix match colData(tse)
rownames(transposed_relabundance) <- colnames(relabundance_matrix_Proteobacteria_order)

# Keep only the top 5 Proteobacteria orders
top_5_relabund <- transposed_relabundance[, top_Proteobacteria_orders, drop = FALSE]

# Rename columns: Remove "__o"
colnames(top_5_relabund) <- gsub("o__", "", colnames(top_5_relabund))

# Add relabundance of the top 5 orders as new columns in colData(tse)
for (order in colnames(top_5_relabund)) {
  colData(tse)[[paste0(order, "_relabund")]] <- top_5_relabund[, order]
}

# Define the relative abundance columns
relabund_columns <- c("Aeromonadales_relabund", "Burkholderiales_relabund",
                      "Enterobacterales_relabund", "Pseudomonadales_relabund",
                      "Rhizobiales_relabund")

# Create a list to store the plots
plots_list <- list()

# Loop through each column and create a plot
for (col in relabund_columns) {
  
  # Extract the order name by removing "_relabund"
  order_name <- gsub("_relabund", "", col)
  
  # Create the plot
  plot <- ggplot(mapping = aes_string(x = col, y = "ARG_load", color = "colData(tse)$Administrative_region"), 
                 data = colData(tse)) +
    geom_point() +
    labs(x = paste("Rel. abundance of", order_name), 
         y = "ARG load", color = "Administrative region") +
    scale_x_continuous(limits = c(0, 1)) +
    theme_bw() +
    scale_fill_viridis_c(option = "magma") +
    ggtitle(paste("o_", order_name)) +
    theme(legend.position = "none")  # Remove legend from individual plots
  
  # Store the plot in the list
  plots_list[[order_name]] <- plot
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine all plots into one row with a shared legend on the right
combined_order_plot <- wrap_plots(plots_list, ncol = 5) + 
  plot_layout(guides = "collect")  # Collect legends and place one legend

ggsave("combined_order_plot.png", combined_order_plot, height = 5, width = 15)

print(combined_order_plot)

Correlation testing between Orders with mean relative abundance > 1% and ARG load

# Calculate mean relative abundance of all orders
mean_relabundance_order <- rowMeans(relabundance_matrix_order)

# Filter mean_relabundance_order to keep only values > 1%
mean_relabundance_order <- mean_relabundance_order[mean_relabundance_order > 1]

# Order alphabetically
mean_relabundance_order <- mean_relabundance_order[order(names(mean_relabundance_order))]

# Print the updated object
print(mean_relabundance_order)
##       o__Aeromonadales          o__Bacillales     o__Burkholderiales 
##               9.712867               2.363786              16.275879 
##     o__Caulobacterales   o__Corynebacteriales       o__Deinococcales 
##               1.222198               3.285481               1.085390 
##    o__Enterobacterales    o__Flavobacteriales       o__Micrococcales 
##               4.785705               4.075827               3.712140 
##   o__Micromonosporales          o__Nostocales o__Propionibacteriales 
##               1.060213               3.708840               1.449689 
##     o__Pseudomonadales         o__Rhizobiales     o__Rhodobacterales 
##               7.112397              10.002062               3.718957 
##       o__Rhodocyclales    o__Rhodospirillales    o__Sphingomonadales 
##               1.373476               1.354292               2.996436 
##    o__Streptomycetales     o__Xanthomonadales 
##               3.224572               2.735867
# Pearson correlation testing

# Prepare a dataframe for correlation testing between Orders and ARG_load

# Extract the order names from mean_relabundance_order
selected_orders <- names(mean_relabundance_order)

# Initialize a list to store results for the selected orders
order_relabund_list <- list()

# Loop through each order in mean_relabundance_order
for (order in selected_orders) {
  # Subset the data for the current order
  tse_order <- tse[rowData(tse)$order %in% c(order) & !is.na(rowData(tse)$order), ]
  
  # Calculate column sums of the "relabundance" assay for the current order
  order_relabund <- colSums(assay(tse_order, "relabundance"))
  
  # Store the result in the list, with the order name as the key
  order_relabund_list[[order]] <- order_relabund
}

# Combine the list into a data frame (orders as rows)
order_relabund_df <- do.call(rbind, order_relabund_list)

# Convert the list to a data frame
order_relabund_df <- as.data.frame(order_relabund_df)

# Transpose the data frame (samples as rows, order mean relative abundances as columns)
order_relabund_df <- t(order_relabund_df)

# Remove "o__" from the start of each column name
colnames(order_relabund_df) <- gsub("^o__", "", colnames(order_relabund_df))

# Access the variables
variable1 <- colData(tse)$ARG_load
variable2 <- as.numeric(order_relabund_df[, "Aeromonadales"])
variable3 <- as.numeric(order_relabund_df[, "Bacillales"])
variable4 <- as.numeric(order_relabund_df[, "Burkholderiales"])
variable5 <- as.numeric(order_relabund_df[, "Caulobacterales"])
variable6 <- as.numeric(order_relabund_df[, "Corynebacteriales"])
variable7 <- as.numeric(order_relabund_df[, "Deinococcales"])
variable8 <- as.numeric(order_relabund_df[, "Enterobacterales"])
variable9 <- as.numeric(order_relabund_df[, "Flavobacteriales"])
variable10 <- as.numeric(order_relabund_df[, "Micrococcales"])
variable11 <- as.numeric(order_relabund_df[, "Micromonosporales"])
variable12 <- as.numeric(order_relabund_df[, "Nostocales"])
variable13 <- as.numeric(order_relabund_df[, "Propionibacteriales"])
variable14 <- as.numeric(order_relabund_df[, "Pseudomonadales"])
variable15 <- as.numeric(order_relabund_df[, "Rhizobiales"])
variable16 <- as.numeric(order_relabund_df[, "Rhodobacterales"])
variable17 <- as.numeric(order_relabund_df[, "Rhodocyclales"])
variable18 <- as.numeric(order_relabund_df[, "Rhodospirillales"])
variable19 <- as.numeric(order_relabund_df[, "Sphingomonadales"])
variable20 <- as.numeric(order_relabund_df[, "Streptomycetales"])
variable21 <- as.numeric(order_relabund_df[, "Xanthomonadales"])

# Calculate Pearson's correlation coefficients and p-values
cor_test_Aeromonadales <- cor.test(variable1, variable2, method = "spearman")
cor_test_Bacillales <- cor.test(variable1, variable3, method = "spearman")
cor_test_Burkholderiales <- cor.test(variable1, variable4, method = "spearman")
cor_test_Caulobacterales <- cor.test(variable1, variable5, method = "spearman")
cor_test_Corynebacteriales <- cor.test(variable1, variable6, method = "spearman")
cor_test_Deinococcales <- cor.test(variable1, variable7, method = "spearman")
cor_test_Enterobacterales <- cor.test(variable1, variable8, method = "spearman")
cor_test_Flavobacteriales <- cor.test(variable1, variable9, method = "spearman")
cor_test_Micrococcales <- cor.test(variable1, variable10, method = "spearman")
cor_test_Micromonosporales <- cor.test(variable1, variable11, method = "spearman")
cor_test_Nostocales <- cor.test(variable1, variable12, method = "spearman")
cor_test_Propionibacteriales <- cor.test(variable1, variable13, method = "spearman")
cor_test_Pseudomonadales <- cor.test(variable1, variable14, method = "spearman")
cor_test_Rhizobiales <- cor.test(variable1, variable15, method = "spearman")
cor_test_Rhodobacterales <- cor.test(variable1, variable16, method = "spearman")
cor_test_Rhodocyclales <- cor.test(variable1, variable17, method = "spearman")
cor_test_Rhodospirillales <- cor.test(variable1, variable18, method = "spearman")
cor_test_Sphingomonadales <- cor.test(variable1, variable19, method = "spearman")
cor_test_Streptomycetales <- cor.test(variable1, variable20, method = "spearman")
cor_test_Xanthomonadales <- cor.test(variable1, variable21, method = "spearman")

# Extract the correlation coefficients and p-values from each test
cor_results <- data.frame(
  Order = c("Aeromonadales", "Bacillales", "Burkholderiales", "Caulobacterales", 
            "Corynebacteriales", "Deinococcales", "Enterobacterales", "Flavobacteriales", 
            "Micrococcales", "Micromonosporales", "Nostocales", "Propionibacteriales", 
            "Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodocyclales", 
            "Rhodospirillales", "Sphingomonadales", "Streptomycetales", "Xanthomonadales"),
  CorrelationCoefficient = c(cor_test_Aeromonadales$estimate, cor_test_Bacillales$estimate, 
                            cor_test_Burkholderiales$estimate, cor_test_Caulobacterales$estimate, 
                            cor_test_Corynebacteriales$estimate, cor_test_Deinococcales$estimate, 
                            cor_test_Enterobacterales$estimate, cor_test_Flavobacteriales$estimate, 
                            cor_test_Micrococcales$estimate, cor_test_Micromonosporales$estimate, 
                            cor_test_Nostocales$estimate, cor_test_Propionibacteriales$estimate, 
                            cor_test_Pseudomonadales$estimate, cor_test_Rhizobiales$estimate, 
                            cor_test_Rhodobacterales$estimate, cor_test_Rhodocyclales$estimate, 
                            cor_test_Rhodospirillales$estimate, cor_test_Sphingomonadales$estimate, 
                            cor_test_Streptomycetales$estimate, cor_test_Xanthomonadales$estimate),
  PValue = c(cor_test_Aeromonadales$p.value, cor_test_Bacillales$p.value, 
             cor_test_Burkholderiales$p.value, cor_test_Caulobacterales$p.value, 
             cor_test_Corynebacteriales$p.value, cor_test_Deinococcales$p.value, 
             cor_test_Enterobacterales$p.value, cor_test_Flavobacteriales$p.value, 
             cor_test_Micrococcales$p.value, cor_test_Micromonosporales$p.value, 
             cor_test_Nostocales$p.value, cor_test_Propionibacteriales$p.value, 
             cor_test_Pseudomonadales$p.value, cor_test_Rhizobiales$p.value, 
             cor_test_Rhodobacterales$p.value, cor_test_Rhodocyclales$p.value, 
             cor_test_Rhodospirillales$p.value, cor_test_Sphingomonadales$p.value, 
             cor_test_Streptomycetales$p.value, cor_test_Xanthomonadales$p.value)
)

# View the resulting data frame with correlation coefficients and p-values
print(cor_results)
##                  Order CorrelationCoefficient      PValue
## 1        Aeromonadales            0.357142857 0.191620173
## 2           Bacillales           -0.303571429 0.270772063
## 3      Burkholderiales            0.085714286 0.762997252
## 4      Caulobacterales            0.242857143 0.381991520
## 5    Corynebacteriales           -0.439285714 0.103199216
## 6        Deinococcales           -0.103571429 0.714423628
## 7     Enterobacterales           -0.110714286 0.695276099
## 8     Flavobacteriales            0.210714286 0.449865622
## 9        Micrococcales           -0.528571429 0.045425102
## 10   Micromonosporales           -0.653571429 0.010031783
## 11          Nostocales           -0.710714286 0.004055074
## 12 Propionibacteriales           -0.664285714 0.008578169
## 13     Pseudomonadales            0.603571429 0.019540831
## 14         Rhizobiales           -0.442857143 0.100186234
## 15     Rhodobacterales           -0.332142857 0.226353138
## 16       Rhodocyclales           -0.075000000 0.792571062
## 17    Rhodospirillales           -0.478571429 0.073466155
## 18    Sphingomonadales           -0.082142857 0.772822127
## 19    Streptomycetales           -0.660714286 0.009043086
## 20     Xanthomonadales           -0.007142857 0.984662517

Exploration of the ARGs

Antibiotic Class

altExp(tse, "read_abclass") <- agglomerateByVariable(
  altExp(tse, "read"),
  rowData(altExp(tse, "read"))$Class,
  by = "rows",
  assay.type = "abundances"
)
## Warning: 'pa' includes binary values.
## Agglomeration of it might lead to meaningless values.
## Check the assay, and consider doing transformation againmanually with agglomerated data.
# Convert to relative abundance
altExp(tse, "read_abclass") <- transformAssay(altExp(tse, "read_abclass"), assay.type = "abundances", method = "relabundance")

# Plot the abundance
ARG_class_plot <- plotAbundance(altExp(tse, "read_abclass"), assay.type = "relabundance", rank = "Class", add_x_text = TRUE)

# Save the plot
ggsave("ARG_class_plot.png", plot = ARG_class_plot, width = 7, height = 4, dpi = 600)

# Print the plot
print(ARG_class_plot)

# Make a relative abundance matrix (RPKM) for ARG class
altExp(tse, "read_abclass") <- transformAssay(altExp(tse, "read_abclass"), assay.type = "abundances", method = "relabundance")
relabundance_matrix_class <- assay(altExp(tse, "read_abclass"), "relabundance")

# Calculate the mean relative abundance of each ARG class
mean_relabundance_class <- (rowSums(relabundance_matrix_class)) / 15

# Order the ARG classes by mean relative abundance
top_classes <- names(sort(mean_relabundance_class, decreasing = TRUE)[1:14])

# Print mean ARG class relative abundance
print(mean_relabundance_class)
##           Aminoglycoside              Beta-lactam Beta-lactam (carbapenem) 
##             0.0277312238             0.5357815461             0.1027188513 
##          Chloramphenicol                 Colistin               Fosfomycin 
##             0.0046715233             0.1438166043             0.0215088775 
##             Fusidic acid              Lincosamide                Macrolide 
##             0.0019493327             0.0051260230             0.0214093242 
##        MDR (efflux pump)                Quinolone              Sulfonamide 
##             0.0115892460             0.0003296112             0.0532232268 
##             Tetracycline             Trimethoprim 
##             0.0511944610             0.0189501488

ARG cluster90 heatmap

# Create an ARG cluster AltExp
altExp(tse, "arg_cluster") <- agglomerateByVariable(altExp(tse, "read"), rowData(altExp(tse, "read"))$Cluster90_description, by = "rows")
## Warning: 'pa' includes binary values.
## Agglomeration of it might lead to meaningless values.
## Check the assay, and consider doing transformation againmanually with agglomerated data.
# Add log abundance assay
tse_arg_cluster <- altExp(tse, "arg_cluster")
tse_arg_cluster <- transformAssay(tse_arg_cluster, assay.type = "abundances",
                                  method = "log", pseudocount = TRUE, name = "log")
## A pseudocount of 0.137471073839662 was applied.
## Warning: non-integer data: divided by smallest positive value
# Get the assay table
mat <- assay(tse_arg_cluster, "log")

# Make a dataset for additional ARG information
arg_data <- as.data.frame(rowData(tse_arg_cluster))
arg_data <- arg_data[, "Class", drop = FALSE]

# Order the ARGs by their class
arg_data_ordered <- arg_data[order(arg_data$Class), , drop = FALSE]

# Reorder the data matrix rows according to the ordered ARGs
mat <- mat[rownames(arg_data_ordered), ]

# Define a color palette that goes from white to red
colors <- colorRampPalette(c("white", "red"))(100)

# Create the heatmap with the custom color palette
ARG_cluster_heatmap <- pheatmap(mat, 
                                annotation_row = arg_data_ordered, 
                                cluster_rows = FALSE, cluster_cols = FALSE, 
                                cellwidth = 15, cellheight = 10,
                                color = colors)
# Save the plot
ggsave("ARG_heatmap.png", plot = ARG_cluster_heatmap, width = 7, height = 4, dpi = 600)

# Print the plot
print(ARG_cluster_heatmap)

IS elements

# Print the number of IS elements in metagenome assemblies
print(colData(tse)$IS_assembled)
##  [1] 12 29  8 11 35 48 97 58 67 13 19 28 25 70 47
# Calculate mean number of IS elements in assemblies
print(mean(colData(tse)$IS_assembled))
## [1] 37.8
# Calculate median  number of IS elements in assemblies
print(median(colData(tse)$IS_assembled))
## [1] 29
# Calculate no. of transposable elements per 1,000,000,000 assembled bases
colData(tse)$IS_per_Mb <- (colData(tse)$IS_assembled / colData(tse)$assembly_length) * 1000000

# Calculate no. of ARGs per 1,000,000,000 assembled bases
colData(tse)$ARGs_per_Mb <- (colData(tse)$ARGs_assembly / colData(tse)$assembly_length) * 1000000

ARGs and IS elements in MAGs

# Load MAG_data
MAG_data <- read.csv("MAG_data.csv")
MAG_AMR <- read.csv("MAG_AMR.csv")
MAG_IS_elements <- read.csv("MAG_IS_elements.csv")

# ARGs per complete genome
MAG_data$ARGs_per_genome <- MAG_data$no_ARGs/(MAG_data$Completeness/100)

# IS elements per complete genome
MAG_data$IS_per_genome <- MAG_data$no_IS/(MAG_data$Completeness/100)
# Reverse the order of Genus variable for A-> Z plotting
MAG_data <- MAG_data %>%
  mutate(Genus = factor(Genus, levels = rev(sort(unique(Genus)))))

# Plot the genera of mid & high quality MAGs
MAG_taxa <- ggplot(mapping = aes(x = Genus, fill = Quality), data = MAG_data) +
  geom_bar() +
  theme_minimal() +
  labs(y = "Count", fill = "MAG quality") +
  scale_x_reverse() +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "High" = "#CCCCCC",  # Light Gray
      "Medium" = "#666666"    # Dark Gray
    )
  ) +
  scale_x_discrete(
    labels = function(x) sapply(x, function(i) bquote(italic(.(i)))) # Italicize genus names
  )
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Save the plot
ggsave("MAG_taxa.png", plot = MAG_taxa, width = 4.5, height = 4, dpi = 300)

# Print the plot
print(MAG_taxa)

# Reverse the order of AMR determinant variable for A -> Z plotting
MAG_AMR <- MAG_AMR %>%
  mutate(AMR_determinant = factor(AMR_determinant, levels = rev(sort(unique(AMR_determinant)))))

# Plot the AMR determinants in mid & high quality MAGs
MAG_AMR_plot <- ggplot(mapping = aes(x = AMR_determinant, fill = Genus), data = MAG_AMR) +
  geom_bar() +
  theme_minimal() +
  labs(x = "AMR determinant", y = "Count") +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "Aeromonas" = "#88CCEE",
      "Shewanella" = "#44AA99",
      "Stenotrophomonas" = "#AA4499"
    ),
    labels = c(
      "Aeromonas" = expression(italic("Aeromonas")),
      "Shewanella" = expression(italic("Shewanella")),
      "Stenotrophomonas" = expression(italic("Stenotrophomonas"))
    )
  )


# Save the plot
ggsave("MAG_AMR_plot.png", plot = MAG_AMR_plot, width = 4.5, height = 4, dpi = 300)

# Print the plot
print(MAG_AMR_plot)

# Reverse the order of IS families variable for A-> Z plotting
MAG_IS_elements <- MAG_IS_elements %>%
  mutate(IS_family = factor(IS_family, levels = rev(sort(unique(IS_family)))))

# Plot the IS families in mid & high quality MAGs
MAG_IS_plot <- ggplot(mapping = aes(x = IS_family, fill = Genus), data = MAG_IS_elements) +
  geom_bar() +
  theme_minimal() +
  labs(x = "IS family", y = "Count") +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "Aeromonas" = "#88CCEE",         # Muted Blue
      "Aliarcobacter" = "#117733",    # Deep Green
      "Limnohabitans" = "#DDCC77",    # Muted Yellow
      "Massilia" = "#332288",         # Deep Blue
      "Serratia" = "#CC6677",         # Muted Coral
      "Shewanella" = "#44AA99",       # Muted Teal
      "Stenotrophomonas" = "#AA4499", # Muted Purple
      "Trichormus" = "#999933"        # Muted Olive
    ),
    labels = c(
      "Aeromonas" = expression(italic("Aeromonas")),
      "Aliarcobacter" = expression(italic("Aliarcobacter")),
      "Limnohabitans" = expression(italic("Limnohabitans")),
      "Massilia" = expression(italic("Massilia")),
      "Serratia" = expression(italic("Serratia")),
      "Shewanella" = expression(italic("Shewanella")),
      "Stenotrophomonas" = expression(italic("Stenotrophomonas")),
      "Trichormus" = expression(italic("Trichormus"))
    )
  )

# Save the plot
ggsave("MAG_IS_plot.png", plot = MAG_IS_plot, width = 4.5, height = 4, dpi = 300)

# Print the plot
print(MAG_IS_plot)

Gammaproteobacteria vs other classes

# Gammaproteobacteria (the class within Proteobacteria from which all binned ARGs were identified) vs other bacteria
MAG_data$Gammaproteo <- ifelse(MAG_data$Class == "Gammaproteobacteria", "Yes", "No")

# ARGs per group
gammaproteo_ARGs <- MAG_data$ARGs_per_genome[MAG_data$Gammaproteo == "Yes"]
others_ARGs <- MAG_data$ARGs_per_genome[MAG_data$Gammaproteo == "No"]

# IS elements per group
gammaproteo_IS <- MAG_data$IS_per_genome[MAG_data$Gammaproteo == "Yes"]
others_IS <- MAG_data$IS_per_genome[MAG_data$Gammaproteo == "No"]

ARGs

# Print mean ARGs per complete Gammaproteobacteria genome
print(mean(gammaproteo_ARGs))
## [1] 0.6155669
# Print mean ARGs per complete 'other' genome
print(mean(others_ARGs))
## [1] 0
# Test for statistical difference (non-parametric) using Wilcoxon rank-sum test
wilcox.test(gammaproteo_ARGs, others_ARGs)
## Warning in wilcox.test.default(gammaproteo_ARGs, others_ARGs): cannot compute
## exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  gammaproteo_ARGs and others_ARGs
## W = 132, p-value = 0.03913
## alternative hypothesis: true location shift is not equal to 0

IS elements

# Print mean ARGs per complete Gammaproteobacteria genome
print(mean(gammaproteo_IS))
## [1] 1.031057
# Print mean ARGs per complete 'other' genome
print(mean(others_IS))
## [1] 0.568052
# Test for statistical difference (non-parametric) using Wilcoxon rank-sum test
wilcox.test(gammaproteo_IS, others_IS)
## Warning in wilcox.test.default(gammaproteo_IS, others_IS): cannot compute exact
## p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  gammaproteo_IS and others_IS
## W = 129.5, p-value = 0.1223
## alternative hypothesis: true location shift is not equal to 0

The effects of sequencing depth on ARG and MAG identification

MAG recovery

# Plot sequencing depth vs MAGs
depth_vs_MAGs_plot <- ggplot(mapping = aes(total_reads, mid_high_MAG), data = colData(tse)) +
  geom_point() +
  labs(x = "Number of reads", y = "Number of MAGs recovered") +
  scale_x_continuous(labels = scales::comma, limits = c(0, 40000000)) +
  scale_y_continuous(limits = c(0, 5)) +
  theme_bw()

# Save the plot
ggsave("depth_vs_MAGs_plot.png", plot = depth_vs_MAGs_plot, width = 7, height = 5)

# Print the plot
print(depth_vs_MAGs_plot)

# Access variables from colData
variable1 <- colData(tse)$total_reads
variable2 <- colData(tse)$mid_high_MAG

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties
# Print correlation coefficient and its associated p-value
print(cor_test)
## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 443.29, p-value = 0.4561
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2084034

ARG richness

# Plot sequencing depth vs MAGs
depth_vs_ARGs_plot <- ggplot(mapping = aes(total_reads, ARG_richness), data = colData(tse)) +
  geom_point() +
  labs(x = "Number of reads", y = "Number of unique ARGs Detected") +
  scale_x_continuous(labels = scales::comma, limits = c(0, 40000000)) +
  scale_y_continuous(limits = c(0, 16)) +
  theme_bw()

# Save the plot
ggsave("depth_vs_ARGs_plot.png", plot = depth_vs_ARGs_plot, width = 7, height = 5)

# Print the plot
print(depth_vs_ARGs_plot)

# Access variables from colData
variable1 <- colData(tse)$total_reads
variable2 <- colData(tse)$ARG_richness

# Calculate Pearson's correlation coefficient and p-value
cor_test <- cor.test(variable1, variable2, method = "spearman")
## Warning in cor.test.default(variable1, variable2, method = "spearman"): Cannot
## compute exact p-value with ties
# Print correlation coefficient and its associated p-value
print(cor_test)
## 
##  Spearman's rank correlation rho
## 
## data:  variable1 and variable2
## S = 401.43, p-value = 0.3065
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2831559

Rarefaction curves

# Import rarefaction data for samples with more than 20M reads following trimming and filtering
rarefaction_data <- read.csv("rarefaction_data.csv")

# Ensure Sample is a factor
rarefaction_data$Sample <- as.factor(rarefaction_data$Sample)

# Plot rarefaction curves
rarefaction_ARG_plot <- ggplot(rarefaction_data, aes(x = Depth, y = ARGs_detected, color = Sample, linetype = Sample)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  labs(
    x = "Reads post-QC",
    y = "Number of unique ARGs Detected"
  ) +
  scale_linetype_manual(values = c("dotted", "twodash", "dashed", "dotdash", "twodash")) +
  scale_x_continuous(labels = scales::comma, limits = c(0, 30000000)) +
  scale_y_continuous(limits = c(0, 15)) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Save the plot
ggsave("rarefaction_ARG_plot.png", plot = rarefaction_ARG_plot, width = 7, height = 5)

# Print the plot
print(rarefaction_ARG_plot)