1 Data Input/Cleaning

1.1 Download Packages

# Install Packages
#install.packages("HelpersMG")
#install.packages("WGCNA")

# The GO.db package will not let me load WGCNA, so let's download it
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("GO.db", version = "3.8")

# Load the packages
library(HelpersMG)
library(WGCNA)
library(tidyverse)
library(phyloseq)

# Set the working directory
current_directory <- getwd()
paste("Working in directory:", current_directory)

## [1] "Working in directory: /Users/marschmi/git_repos/HNA_LNA_productivity"

1.2 Load Data

# Load data
load("data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_5in10_physeqs.RData")
rare_muskegon_physeq_5in10_abs

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 482 taxa and 62 samples ]
## sample_data() Sample Data:       [ 62 samples by 18 sample variables ]
## tax_table()   Taxonomy Table:    [ 482 taxa by 7 taxonomic ranks ]

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE)

# Prepare CLR transformed OTU Table 
clr_otu <- read.csv("data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_clr_otu_5in10.csv") %>%
  tibble::column_to_rownames(var = "X") 
# Prepare Taxonomy Table 
clr_tax <- tax_table(rare_muskegon_physeq_5in10_abs)
# Prepare Metadata file 
clr_metadata <- 
  sample_data(rare_muskegon_physeq_5in10_abs) %>%
  mutate(norep_filter_name = paste(substr(Sample_16S, 1,4), substr(Sample_16S, 6, 9), sep=""),
         # duplicate Sample_16S column to move it to the rownames
         names = Sample_16S) %>%
  tibble::column_to_rownames(var = "names") 

# Create the phyloseq object
clr_physeq <- merge_phyloseq(otu_table(clr_otu, taxa_are_rows = FALSE), clr_tax, sample_data(clr_metadata))

# Read in the producutivity data
prod_data <- read.csv("data/production_data.csv") %>%
  dplyr::filter(fraction == "Free" & limnion == "Surface") %>%  # Select only rows that are free-living
  dplyr::select(names, tot_bacprod, SD_tot_bacprod) %>%         # Select relevant columns for total productivity
  mutate(tot_bacprod = round(tot_bacprod, digits = 2),          # Round to 2 decimals
         SD_tot_bacprod = round(SD_tot_bacprod, digits = 2) ) %>%      
  dplyr::rename(norep_filter_name = names) %>%                  # Rename to match other data frame
  arrange(norep_filter_name)

# Put together a dataframe with metadata and production data
meta_prod_dat <-
  prod_data %>%
  left_join(clr_metadata ,by = "norep_filter_name") %>%
  mutate(names = Sample_16S) %>%
  tibble::column_to_rownames(var = "names")

###### FINAL PHYLOSEQ OBJECT
prod_clr_physeq_orig <- subset_samples(clr_physeq, Sample_16S %in% meta_prod_dat$Sample_16S)
# Overwrite the sample data to include productivity data
prod_clr_physeq <- merge_phyloseq(otu_table(prod_clr_physeq_orig), tax_table(prod_clr_physeq_orig), sample_data(meta_prod_dat))
prod_clr_physeq

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 482 taxa and 23 samples ]
## sample_data() Sample Data:       [ 23 samples by 21 sample variables ]
## tax_table()   Taxonomy Table:    [ 482 taxa by 7 taxonomic ranks ]

# Pull out the information and clean up the data
prod_clr_otu <- otu_table(prod_clr_physeq)
datOTUClr <- as.data.frame(prod_clr_otu)

# Metadata 
prod_clr_dat <- data.frame(sample_data(prod_clr_physeq))

# Select only the numeric traits
datContinuous <- 
  prod_clr_dat %>%
  dplyr::select(c(tot_bacprod, SD_tot_bacprod, Total.cells, Total.count.sd, 
                  HNA.cells, HNA.sd, LNA.cells, LNA.sd, Total_Sequences))
head(datContinuous)

##           tot_bacprod SD_tot_bacprod Total.cells Total.count.sd HNA.cells
## MBRE1F714       33.86           1.30    10104066       55102.71   2724102
## MBRE1F914       24.72           0.36     7206483       65178.68   1928397
## MBRE1F715       11.87           2.15     7420899       70408.32   1343202
## MBRE2F515       29.34          10.78     4326956       96174.94   1739097
## MBRE2F915       22.23           0.83     4895228       22853.30   1264624
## MDPE1F514       43.30           4.65     4412738      112621.94   2396948
##              HNA.sd LNA.cells    LNA.sd Total_Sequences
## MBRE1F714  57003.29   7379964  57715.92           33038
## MBRE1F914  42404.03   5278086  54019.71           24538
## MBRE1F715  44613.63   6077697  31710.77           17500
## MBRE2F515  70451.47   2587859  31361.59           27722
## MBRE2F915  16800.30   3630604  14092.64           21845
## MDPE1F514 114096.60   2015790 100625.07          104975

1.3 Data Cleaning

Below is an analysis to find out whether there are outliers. Thankfully, there are none!

# Check for OTUs and samples with too many missing values
good_otus <- goodSamplesGenes(datOTUClr, verbose = 3);

##  Flagging genes and samples with too many missing values...
##   ..step 1

good_otus$allOK

## [1] TRUE

# Search the data for outliers via clustering
#sampleTree <- hclust(dist(datOTUClr), method = "average")
#sampleTree

# Plot it 
#sizeGrWindow(12, 9) # Set plotting window to be 12 by 9 inches
#par(cex = 1, mar = c(0,4,2,0))
#plot(sampleTree, main = "Sample Clustering: No Outliers", 
#     sub = "", xlab = "", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)

# Clust contains the samples we want to keep
paste("There are", ntaxa(prod_clr_physeq), "genes in the dataset.")

## [1] "There are 482 genes in the dataset."

paste("There are", nsamples(prod_clr_physeq), "samples in the dataset.")

## [1] "There are 23 samples in the dataset."

# Re-cluster samples
sampleTree2 <- hclust(dist(datOTUClr), method = "average")

# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors <- numbers2colors(datContinuous, signed = FALSE);

# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datContinuous),
                    main = "Sample dendrogram and trait heatmap")

Figure: Clustering dendrogram of samples based on their Euclidean distance.

1.4 Save the data

save(prod_clr_physeq, datOTUClr, datContinuous, file = "data/WGCNA/WGCNA_data.RData")

2 Automatic network construction & Model Detection

Automatic Netrowk Construction Tutorial Here

2.1 Choosing the soft-thresholding power: analysis of network topology

# Choose a set of soft-thresholding powers
powers <- c(c(1:10), seq(from = 12, to=20, by=2))

# Call the network topology analysis function
sft <- pickSoftThreshold(datOTUClr, powerVector = powers, verbose = 5)

## pickSoftThreshold: will use block size 482.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 482 of 482
##    Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.225 -0.69        0.73300 130.000  127.0000  214.0
## 2      2    0.834 -1.10        0.92500  52.200   46.6000  125.0
## 3      3    0.925 -1.22        0.93700  25.900   20.4000   84.5
## 4      4    0.943 -1.26        0.93100  14.700    9.9900   62.4
## 5      5    0.910 -1.31        0.88500   9.200    5.2700   49.0
## 6      6    0.813 -1.42        0.76400   6.200    2.9300   40.1
## 7      7    0.896 -1.33        0.86600   4.430    1.6900   33.8
## 8      8    0.939 -1.26        0.93400   3.310    1.0200   29.2
## 9      9    0.893 -1.28        0.87600   2.560    0.6470   25.8
## 10    10    0.206 -2.23       -0.00829   2.050    0.4300   23.1
## 11    12    0.839 -1.25        0.81100   1.400    0.2010   19.2
## 12    14    0.187 -1.86       -0.04490   1.020    0.0960   16.4
## 13    16    0.843 -1.20        0.82200   0.788    0.0505   14.4
## 14    18    0.165 -1.62       -0.04570   0.629    0.0267   12.9
## 15    20    0.197 -1.71       -0.03110   0.517    0.0152   11.6

# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2))

# Set some parameters
cex1 = 0.9

# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], 
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", 
     main = paste("Scale independence"))

text(sft$fitIndices[,1], -sign(sft$fitIndices[,3]) * sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")

# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")

# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5], 
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

2.1.0.1 The soft threshold is 3!

# Turn adjacency into topological overlap matrix (TOM)
adjacency <- adjacency(datOTUClr, power = 3)
TOMadj <- TOMsimilarity(adjacency)

## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

dissTOMadj <- 1- TOMadj


# Clustering using TOM
# Call the hierarchical clustering function 
hclustGeneTree <- hclust(as.dist(dissTOMadj), method = "average")

# Plot the resulting clustering tree (dendogram)
sizeGrWindow(12, 9)
plot(hclustGeneTree, xlab = "", sub = "", 
     main = "Gene Clustering on TOM-based disssimilarity", 
     labels = FALSE, hang = 0.04)

# Make the modules larger, so set the minimum higher
minModuleSize <- 12

# Module ID using dynamic tree cut
dynamicMods <- cutreeDynamic(dendro = hclustGeneTree, 
                             distM = dissTOMadj,
                             deepSplit = 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize)

##  ..cutHeight not given, setting it to 0.964  ===>  99% of the (truncated) height range in dendro.
##  ..done.

table(dynamicMods)

## dynamicMods
##   1   2   3   4   5 
## 315  61  38  36  32

# Convert numeric lables into colors
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)

## dynamicColors
##      blue     brown     green turquoise    yellow 
##        61        38        32       315        36

# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(hclustGeneTree, dynamicColors, "Dynamic Tree Cut", 
                    dendroLabels = FALSE, hang = 0.03, 
                    addGuide = TRUE, guideHang = 0.05, 
                    main = "Gene dendrogram and module colors")

# Calculate eigengenes
dynamic_MEList <- moduleEigengenes(datOTUClr, colors = dynamicColors)
dynamic_MEs <- dynamic_MEList$eigengenes

# Calculate dissimilarity of module eigengenes
dynamic_MEDiss <- 1-cor(dynamic_MEs)
dynamic_METree <- hclust(as.dist(dynamic_MEDiss))
# Plot the hclust
sizeGrWindow(7,6)
plot(dynamic_METree, main = "Dynamic Clustering of module eigengenes",
     xlab = "", sub = "")

######################## MERGE SIMILAR MODULES
dynamic_MEDissThres <- 0.25

# Plot the cut line
#abline(h = dynamic_MEDissThres, col = "red")

# Call an automatic merging function
merge_dynamic_MEDs <- mergeCloseModules(datOTUClr, dynamicColors, cutHeight = dynamic_MEDissThres, verbose = 3)

##  mergeCloseModules: Merging modules whose distance is less than 0.25
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 5 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 5 module eigengenes in given set.

# The Merged Colors
dynamic_mergedColors <- merge_dynamic_MEDs$colors

# Eigen genes of the new merged modules
mergedMEs <- merge_dynamic_MEDs$newMEs
mergedMEs

##                MEblue      MEgreen      MEbrown MEturquoise    MEyellow
## MBRE1F714 -0.07477574 -0.135290324  0.416986081 -0.17757671 -0.13452258
## MBRE1F914  0.14871655  0.501261302 -0.150849213 -0.13435389 -0.27162703
## MBRE1F715 -0.11141899 -0.044305166  0.041025250 -0.10862524  0.29468872
## MBRE2F515 -0.13225971 -0.074095503 -0.056890112  0.26052456  0.29750960
## MBRE2F915  0.40051486 -0.022608041 -0.090142177 -0.16335447  0.05200960
## MDPE1F514 -0.15418589 -0.091568612 -0.145805291  0.37591018 -0.11844501
## MDPE1F714 -0.14999524 -0.225154636  0.366447475 -0.16630213 -0.28959076
## MDPE1F914  0.13727902  0.592327410 -0.134733998 -0.14500424 -0.26859169
## MDPE1F715 -0.15252747 -0.125455030  0.078769473 -0.12329157  0.15924140
## MDPE2F515 -0.10125151  0.004176903 -0.005615171  0.25589205  0.34611912
## MDPE2F915  0.43111546 -0.132479691 -0.189948738 -0.16460255 -0.06387725
## MINE1F514 -0.10813300 -0.054487484 -0.127827478  0.35762970  0.02639579
## MINE1F714 -0.18062524 -0.186213416  0.316822686 -0.12653316  0.03401600
## MINE1F914 -0.02528633  0.190380196 -0.145853239 -0.03852416 -0.16041303
## MINE1F715 -0.16580270 -0.137093351 -0.037690616 -0.02880182  0.16812382
## MINE2F515 -0.12461981 -0.053974187 -0.128431941  0.26389711  0.17673427
## MINE2F915  0.31545737 -0.117989551 -0.191235301 -0.09072100  0.09146613
## MOTE1F514 -0.17484364 -0.111442589 -0.150322176  0.35322592 -0.17748902
## MOTE1F714 -0.13739765 -0.042859458  0.579147918 -0.17984627 -0.26019467
## MOTE1F914  0.18372613  0.368564362 -0.105190531 -0.17499813 -0.29648555
## MOTE1F715 -0.12900770 -0.082537395  0.014542652 -0.13484198  0.24872978
## MOTE2F515 -0.13328496 -0.089015640 -0.056458549  0.28200293  0.23995399
## MOTE2F915  0.43860620  0.069859900 -0.096747004 -0.19170514 -0.09375166

table(dynamic_mergedColors)

## dynamic_mergedColors
##      blue     brown     green turquoise    yellow 
##        61        38        32       315        36

sizeGrWindow(12,9)
plotDendroAndColors(hclustGeneTree, cbind(dynamicColors, dynamic_mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05)

# Rename Module Colors 
moduleColors <- dynamic_mergedColors

# Construct numerical labels corresponding to the colors 
colorOrder <- c("grey", standardColors(50))
moduleLabels <- match(moduleColors, colorOrder)-1
MEs <- mergedMEs

save(MEs, moduleLabels, moduleColors, hclustGeneTree, file = "data/WGCNA/dynamic-prodOTU-02-networkConstruction.RData")

Figure: Clustering dendrogram of genes obtained in the single-block analysis, together with module colors determined in the single-block analysis and the module colors determined in the block-wise analysis. There is excellent agreement between the single-block and block-wise network construction and module detection.

3 Relating modules to environmental traits

Relating Modules to Traits Tutorial Here

3.1 Quantifying module-trait associations

# define numbers of genes and samples
nOTUs <- ncol(datOTUClr)
nSamples <- nrow(datOTUClr)

# Recalculate MEs with color labels
MEs0 <- moduleEigengenes(datOTUClr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)

names(MEs) <- substring(names(MEs), 3)


moduleTraitCor <- cor(MEs, datContinuous, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# PLOT
sizeGrWindow(10,6)
textMatrix <- paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))

# Display the correlation values within a heatmap
labeledHeatmap(Matrix = moduleTraitCor, 
               xLabels = names(datContinuous),
               yLabels = names(MEs), 
               ySymbols = names(MEs), 
               colorLabels = FALSE, 
               colors = blueWhiteRed(50),
               textMatrix = textMatrix, 
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait Relationships"))

Figure: Relationships of consensus module eignegenes and traits of samples collected from Muskegon Lake. Each row in the table corresponds to a consensus module (of OTUs), and each column is a trait. Numbers in the table report the correlations of the corresponding module eigengenes and traits, with a p-value printed below in parentheses. The table is color coded by correlation according to the legend.

We are interested in classifying the modules that are associated with bacterial production. So therefore, the trait of interest is tot_bacprod.

From the plot above, we can see that the following modules as significantly (pvalues in the parentheses):

Blue, correlation = -0.52

3.2 OTU relationship to trait and important modules

3.2.1 OTU significance and module membership

Goal: Quantify associations of individual OTU with trait of interest (tot_bacprod)

How?

Define OTU significance (GS, originally called gene significance) as the absolute value of the correlation between the gene and the trait.
For each module, also define a quantitative measure of module membership (MM) as the correlation of the module eigngene and the gene expression profile. Allows us to quantify the similarity of all genes on the array to every module.

# Define variable weight containing the weight column of datTrait
bacprod <- as.data.frame(datContinuous$tot_bacprod)
names(bacprod) <- "bacprod" # rename

# Names (colors of the modules
#modNames <- substring(names(MEs), 3) # Remove the first two letters in string

# Calculate the correlations between modules
geneModuleMembership <- as.data.frame(WGCNA::cor(datOTUClr, MEs, use = "p"))

# What are the p-values for each correlation?
MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))

# What's the correlation for the trait: bacterial production?
geneTraitSignificance <- as.data.frame(cor(datOTUClr, bacprod, use = "p"))

# What are the p-values for each correlation?
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples = nSamples))

names(geneTraitSignificance) <- paste("GS.", names(bacprod), sep = "")
names(GSPvalue) <- paste("p.GS.", names(bacprod), sep = "")

3.3 Intramodular analysis: ID genes with high GS and MM

Using GS and MM, let’s identify genes with a high significance for bacterial production as well as high module membership in interesting modules.

# Fix the names so that they match the actual color
#names(geneModuleMembership) <- substring(names(geneModuleMembership), 3) # Remove the first two letters in string

modNames <- names(geneModuleMembership)

par(mfrow = c(2,3))  
  
# Initialize for loop
      # NEED: modNames
for (i in names(geneModuleMembership)) {
  
  # Pull out the module we're working on
  module <- i
  print(module)   
  
  # Find the index in the column of the dataframe 
  column <- match(module, modNames)
  #print(column)
  
  # Pull out the Gene Significance vs module membership of the module
  moduleGenes = moduleColors == module
  vecOTUnames = rownames(geneTraitSignificance)
  print(paste("There are ", length(vecOTUnames[moduleGenes]), " OTUs in the ", module, " module.", sep = ""))
  print(vecOTUnames[moduleGenes])
  
  # Make the plot
  verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), 
                 abs(geneTraitSignificance[moduleGenes, 1]),
                 xlab = paste("Module Membership in", module, "module"),
                 ylab = "Gene significance for bacterial production",
                 main = paste("Module membership vs. gene significnace \n"),
                 cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
}

## [1] "blue"
## [1] "There are 61 OTUs in the blue module."
##  [1] "Otu000034" "Otu000058" "Otu000067" "Otu000087" "Otu000091"
##  [6] "Otu000095" "Otu000116" "Otu000122" "Otu000127" "Otu000141"
## [11] "Otu000149" "Otu000161" "Otu000193" "Otu000232" "Otu000238"
## [16] "Otu000242" "Otu000244" "Otu000268" "Otu000269" "Otu000292"
## [21] "Otu000312" "Otu000338" "Otu000339" "Otu000358" "Otu000359"
## [26] "Otu000386" "Otu000413" "Otu000423" "Otu000438" "Otu000472"
## [31] "Otu000481" "Otu000503" "Otu000505" "Otu000519" "Otu000522"
## [36] "Otu000568" "Otu000625" "Otu000654" "Otu000673" "Otu000679"
## [41] "Otu000701" "Otu000729" "Otu000730" "Otu000753" "Otu000843"
## [46] "Otu000905" "Otu000923" "Otu000961" "Otu001066" "Otu001084"
## [51] "Otu001137" "Otu001166" "Otu001180" "Otu001266" "Otu001298"
## [56] "Otu001366" "Otu001402" "Otu001650" "Otu001931" "Otu002026"
## [61] "Otu002112"

## [1] "green"
## [1] "There are 32 OTUs in the green module."
##  [1] "Otu000033" "Otu000060" "Otu000081" "Otu000131" "Otu000176"
##  [6] "Otu000322" "Otu000332" "Otu000344" "Otu000449" "Otu000517"
## [11] "Otu000521" "Otu000593" "Otu000612" "Otu000613" "Otu000615"
## [16] "Otu000644" "Otu000665" "Otu000711" "Otu000766" "Otu000796"
## [21] "Otu000924" "Otu000938" "Otu001030" "Otu001100" "Otu001120"
## [26] "Otu001171" "Otu001210" "Otu001293" "Otu001476" "Otu001487"
## [31] "Otu001691" "Otu002041"

## [1] "brown"
## [1] "There are 38 OTUs in the brown module."
##  [1] "Otu000041" "Otu000044" "Otu000062" "Otu000126" "Otu000190"
##  [6] "Otu000195" "Otu000205" "Otu000212" "Otu000240" "Otu000246"
## [11] "Otu000267" "Otu000451" "Otu000462" "Otu000636" "Otu000637"
## [16] "Otu000670" "Otu000680" "Otu000682" "Otu000715" "Otu000743"
## [21] "Otu000782" "Otu000821" "Otu000837" "Otu000860" "Otu000878"
## [26] "Otu000888" "Otu000894" "Otu000917" "Otu001086" "Otu001136"
## [31] "Otu001158" "Otu001246" "Otu001464" "Otu001520" "Otu001574"
## [36] "Otu001602" "Otu001831" "Otu001913"

## [1] "turquoise"
## [1] "There are 315 OTUs in the turquoise module."
##   [1] "Otu000001" "Otu000005" "Otu000006" "Otu000007" "Otu000009"
##   [6] "Otu000010" "Otu000014" "Otu000016" "Otu000017" "Otu000018"
##  [11] "Otu000019" "Otu000020" "Otu000022" "Otu000023" "Otu000025"
##  [16] "Otu000026" "Otu000027" "Otu000029" "Otu000030" "Otu000038"
##  [21] "Otu000040" "Otu000042" "Otu000043" "Otu000047" "Otu000048"
##  [26] "Otu000049" "Otu000050" "Otu000057" "Otu000059" "Otu000061"
##  [31] "Otu000063" "Otu000065" "Otu000066" "Otu000068" "Otu000071"
##  [36] "Otu000073" "Otu000074" "Otu000075" "Otu000076" "Otu000078"
##  [41] "Otu000079" "Otu000080" "Otu000082" "Otu000083" "Otu000084"
##  [46] "Otu000086" "Otu000090" "Otu000093" "Otu000094" "Otu000096"
##  [51] "Otu000098" "Otu000099" "Otu000100" "Otu000101" "Otu000103"
##  [56] "Otu000105" "Otu000106" "Otu000107" "Otu000108" "Otu000109"
##  [61] "Otu000113" "Otu000115" "Otu000119" "Otu000120" "Otu000123"
##  [66] "Otu000124" "Otu000128" "Otu000132" "Otu000133" "Otu000136"
##  [71] "Otu000137" "Otu000139" "Otu000144" "Otu000147" "Otu000150"
##  [76] "Otu000151" "Otu000152" "Otu000154" "Otu000158" "Otu000159"
##  [81] "Otu000160" "Otu000163" "Otu000165" "Otu000167" "Otu000168"
##  [86] "Otu000169" "Otu000171" "Otu000172" "Otu000173" "Otu000175"
##  [91] "Otu000179" "Otu000182" "Otu000183" "Otu000184" "Otu000187"
##  [96] "Otu000189" "Otu000191" "Otu000192" "Otu000194" "Otu000196"
## [101] "Otu000203" "Otu000204" "Otu000207" "Otu000210" "Otu000213"
## [106] "Otu000216" "Otu000217" "Otu000218" "Otu000219" "Otu000221"
## [111] "Otu000222" "Otu000223" "Otu000224" "Otu000226" "Otu000227"
## [116] "Otu000230" "Otu000235" "Otu000247" "Otu000248" "Otu000249"
## [121] "Otu000253" "Otu000255" "Otu000257" "Otu000259" "Otu000261"
## [126] "Otu000262" "Otu000263" "Otu000264" "Otu000266" "Otu000271"
## [131] "Otu000273" "Otu000275" "Otu000277" "Otu000279" "Otu000283"
## [136] "Otu000284" "Otu000285" "Otu000289" "Otu000290" "Otu000291"
## [141] "Otu000294" "Otu000296" "Otu000297" "Otu000298" "Otu000302"
## [146] "Otu000305" "Otu000309" "Otu000313" "Otu000316" "Otu000317"
## [151] "Otu000319" "Otu000320" "Otu000323" "Otu000325" "Otu000328"
## [156] "Otu000330" "Otu000336" "Otu000340" "Otu000343" "Otu000346"
## [161] "Otu000349" "Otu000355" "Otu000356" "Otu000361" "Otu000373"
## [166] "Otu000377" "Otu000379" "Otu000384" "Otu000385" "Otu000388"
## [171] "Otu000389" "Otu000390" "Otu000392" "Otu000399" "Otu000400"
## [176] "Otu000402" "Otu000403" "Otu000407" "Otu000408" "Otu000409"
## [181] "Otu000410" "Otu000414" "Otu000416" "Otu000418" "Otu000421"
## [186] "Otu000422" "Otu000426" "Otu000427" "Otu000428" "Otu000429"
## [191] "Otu000434" "Otu000440" "Otu000445" "Otu000453" "Otu000456"
## [196] "Otu000458" "Otu000459" "Otu000460" "Otu000469" "Otu000470"
## [201] "Otu000471" "Otu000474" "Otu000478" "Otu000482" "Otu000487"
## [206] "Otu000494" "Otu000496" "Otu000506" "Otu000509" "Otu000511"
## [211] "Otu000514" "Otu000515" "Otu000535" "Otu000536" "Otu000539"
## [216] "Otu000547" "Otu000559" "Otu000563" "Otu000574" "Otu000578"
## [221] "Otu000581" "Otu000587" "Otu000590" "Otu000604" "Otu000605"
## [226] "Otu000614" "Otu000623" "Otu000631" "Otu000645" "Otu000651"
## [231] "Otu000655" "Otu000660" "Otu000669" "Otu000672" "Otu000681"
## [236] "Otu000708" "Otu000712" "Otu000716" "Otu000722" "Otu000725"
## [241] "Otu000745" "Otu000748" "Otu000749" "Otu000756" "Otu000764"
## [246] "Otu000770" "Otu000771" "Otu000774" "Otu000778" "Otu000779"
## [251] "Otu000787" "Otu000799" "Otu000815" "Otu000827" "Otu000832"
## [256] "Otu000833" "Otu000836" "Otu000848" "Otu000851" "Otu000865"
## [261] "Otu000885" "Otu000893" "Otu000898" "Otu000899" "Otu000900"
## [266] "Otu000904" "Otu000931" "Otu000932" "Otu000937" "Otu000977"
## [271] "Otu000985" "Otu001018" "Otu001021" "Otu001023" "Otu001059"
## [276] "Otu001067" "Otu001078" "Otu001087" "Otu001089" "Otu001124"
## [281] "Otu001132" "Otu001142" "Otu001183" "Otu001200" "Otu001232"
## [286] "Otu001233" "Otu001236" "Otu001242" "Otu001256" "Otu001267"
## [291] "Otu001273" "Otu001300" "Otu001317" "Otu001329" "Otu001330"
## [296] "Otu001368" "Otu001387" "Otu001439" "Otu001451" "Otu001452"
## [301] "Otu001473" "Otu001570" "Otu001580" "Otu001664" "Otu001672"
## [306] "Otu001697" "Otu001767" "Otu001799" "Otu001835" "Otu001846"
## [311] "Otu001959" "Otu002142" "Otu002234" "Otu002374" "Otu002481"

## [1] "yellow"
## [1] "There are 36 OTUs in the yellow module."
##  [1] "Otu000004" "Otu000011" "Otu000012" "Otu000024" "Otu000036"
##  [6] "Otu000046" "Otu000053" "Otu000064" "Otu000070" "Otu000085"
## [11] "Otu000088" "Otu000104" "Otu000112" "Otu000135" "Otu000143"
## [16] "Otu000162" "Otu000293" "Otu000335" "Otu000412" "Otu000425"
## [21] "Otu000454" "Otu000516" "Otu000523" "Otu000595" "Otu000607"
## [26] "Otu000624" "Otu000810" "Otu000857" "Otu000870" "Otu001126"
## [31] "Otu001319" "Otu001327" "Otu001345" "Otu001480" "Otu002189"
## [36] "Otu002831"

Figure : Above is a scatterplot Gene Significance (GS) for bacterial production vs. Module Membership (MM) in the each of the different modules.

Modules with a high & significant correlations:

Positive Correlations:
- blue: cor = 0.64, p = 2.8e-08
- brown: cor = 0.33, p = 0.043
- turquoise: cor = 0.17, p = 0.0025

for (i in names(geneModuleMembership)) {
  # Pull out the module we're working on
  module <- i
  # Find the index in the column of the dataframe 
  column <- match(module, modNames)
  # Pull out the Gene Significance vs module membership of the module
  moduleGenes = moduleColors == module
  vecOTUnames = rownames(geneTraitSignificance)
  print(paste("There are ", length(vecOTUnames[moduleGenes]), " OTUs in the ", module, " module.", sep = ""))
  
  # NOTE: This makes hidden variables with the OTU names: brown_OTUs, yellow_OTUs
  assign(paste(module, "_OTUs", sep = ""), vecOTUnames[moduleGenes])
}

## [1] "There are 61 OTUs in the blue module."
## [1] "There are 32 OTUs in the green module."
## [1] "There are 38 OTUs in the brown module."
## [1] "There are 315 OTUs in the turquoise module."
## [1] "There are 36 OTUs in the yellow module."

3.4 Summary output of network analysis

From the plot above, it looks like the blue module has the largest positive correlation with bacterial production.

Which bacterial taxa make up this module?

We just identified modules with high association with the trait of interest (i.e. brown & red, but not magenta or midnightblue), now let’s merge this information with gene annotation information and write out a file that summarizes the results.

# Combine pval, taxonomy, module membership, and gene significance into one dataframe

# Prepare pvalue df
GSpval <- GSPvalue %>%
  tibble::rownames_to_column(var = "OTU")

# Prepare taxonomy df
tax_df <- tax_table(prod_clr_physeq) %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "OTU") %>%
  rename(Domain = Rank1, Phylum = Rank2, Class = Rank3, Order = Rank4, Family = Rank5, Genus = Rank6, Species = Rank7)

# Prepare module membership df
# Modify column names
#names(geneModuleMembership) <- substring(names(geneModuleMembership), 3)

gMM_df <- geneModuleMembership %>%
  tibble::rownames_to_column(var = "OTU") %>%
  gather(key = "moduleColor", value = "moduleMemberCorr", -OTU) 

# Prepare gene significance df
GS_bacprod_df <- geneTraitSignificance %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "OTU")

# Put everything together 
allData_df <- gMM_df %>%
  left_join(GS_bacprod_df, by = "OTU") %>%
  left_join(GSpval, by = "OTU") %>%
  left_join(tax_df, by = "OTU")

# Write a file 
write.csv(allData_df, file = "data/WGCNA/allData_df_WGCNA.csv")

4 Network visualization using WGCNA functions

Visualization Tutorial Here

4.1 Visualizing OTU netowrks

# Calculate topological overlap. 
# Calculate during module detection, but calculating again here:
dissTOM <- 1-TOMsimilarityFromExpr(datOTUClr, power = 3)

## TOM calculation: adjacency..
## ..will not use multithreading.
##  Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

# Transform with a power to make moderately strong connectsion more visible in the heatmap
plotTOM <- dissTOM^7

# Set diagnol to NA for a nicer plot
diag(plotTOM) <- NA

######################## ALL GENES - even though it looks like subsampling 
nSelect = 482
nGenes = ncol(datOTUClr)
nSamples = nrow(datOTUClr)

# For reproducibility, we set the random seed
set.seed(10);
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

Figure: Visualizing the OTU network using a heatmap plot. The heatmap depicts the Topological Overlap Matrix (TOM) among all OTUs in the analysis. Light color represents low overlap and progressively darker red color represents higher overlap. Blocks of darker colors along the diagonal are the modules. The gene dendrogram and module assignment are also shown along the left side and the top.

4.2 Visualizing the network of eigengenes

# Reclaculate module eigengenes
#MEs = moduleEigengenes(datOTUClr, moduleColors)$eigengenes

# Isolate weight from the clinical traits
bacprod <- as.data.frame(datContinuous$tot_bacprod)
names(bacprod) = "bacprod"

# Add the weight to existing module eigengenes
MET <- orderMEs(cbind(MEs, bacprod))
#names(MET) <- substring(names(MET), 3)

# Plot the dendogram
sizeGrWindow(6,6)
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendogram", marDendro = c(0,4,2,0),
                      plotHeatmaps = FALSE)

# Plot the heatmap 
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
                      plotDendograms = TRUE, xLabeles = 90)

Figure: Visualization of the eigengene network representing the relationships among the modules and bacterial production. Left panel shows a hierarchical clustering dendrogram of the eigengenes in which the dissimilarity of eigengenes EI , EJ is given by 1 − cor(EI , EJ ). The heatmap in the right panel shows the eigengene adjacency AIJ = (1 + cor(EI , EJ ))/2.

5 OTUs - Module Membership

library(DT)

# Module Membership
mm_df <- geneModuleMembership %>%
  tibble::rownames_to_column(var = "OTU") %>%
  mutate(blue = round(abs(blue), digits = 3), 
         green = round(abs(green), digits = 3),  
         brown = round(abs(brown), digits = 3), 
         turquoise = round(abs(turquoise), digits = 3),  
         yellow = round(abs(yellow), digits = 3)) %>%
  left_join(tax_df, by = "OTU")

# OTUs from Final figure
dplyr::filter(mm_df, OTU == "Otu000173")

##         OTU  blue green brown turquoise yellow   Domain        Phylum
## 1 Otu000173 0.411 0.175 0.176     0.496  0.052 Bacteria Bacteroidetes
##            Class            Order Family   Genus      Species
## 1 Flavobacteriia Flavobacteriales  bacII bacII-A Unclassified

dplyr::filter(mm_df, OTU == "Otu000029")

##         OTU  blue green brown turquoise yellow   Domain        Phylum
## 1 Otu000029 0.463 0.351 0.044     0.616  0.022 Bacteria Bacteroidetes
##        Class        Order Family    Genus Species
## 1 Cytophagia Cytophagales bacIII bacIII-B   Algor

# These OTUs should be in the blue modules:
dplyr::filter(mm_df, OTU %in% blue_OTUs) %>%
  datatable(options = list(pageLength = 10), caption = 'List of OTUs in blue module')

# Green OTUs
dplyr::filter(mm_df, OTU %in% green_OTUs) %>%
  datatable(options = list(pageLength = 10), caption = 'List of OTUs in green module')

# brown
dplyr::filter(mm_df, OTU %in% brown_OTUs) %>%
  datatable(options = list(pageLength = 10), caption = 'List of OTUs in brown module')

# turquoise
dplyr::filter(mm_df, OTU %in% turquoise_OTUs) %>%
  datatable(options = list(pageLength = 10), caption = 'List of OTUs in turquoise module')

# yellow
dplyr::filter(mm_df, OTU %in% yellow_OTUs) %>%  
  datatable(options = list(pageLength = 10), caption = 'List of OTUs in yellow module')

WGCNA Tutorial

Marian L Schmidt

2019 April, 26