data("airway")
<- DESeqDataSet(se = airway, design = ~ cell + dex) dds
1 Introduction
The analysis of RNA-Seq data involves two distinct parts. The first one needs the use of servers or HPC (high performance computers) and has to do with quality control. pre-processing, alignment-mapping (usually with the STAR software) and counting (can be done using RSEM software). The second one is called downstream analysis and this part involves differential gene expression, gene sets analysis, etc.
Due to the lack of facility to use a server or hpc the first part of the RNA-Seq analysis won’t be done but, you can use the rnaseq pipeline from nfcore. In my experience, the best combination is to use fastq + STAR + RSEM combination of software for this part of the analysis.
With respect to the second part of the analysis, on this document we’ll see how to analyze the airway data set which is a treatment of dexametasome on specific cell lines. Specifically, differentially gene expression, gene sets enrichment analysis and network analysis will be performed.
2 Data & Methods
We are gonna use the airway dataset from the airway
package. Part of the analysis will be: - Differential gene expression using DESeq2
. - Gene set analysis with fgsea from Cluster profiler. - Network analysis with networkx.
2.1 Data Loading
For all the analysis the Himes et al. (2014) dataset will be used. Also, the Love, Huber, and Anders (2014) package will be used for the differential gene expression step.
2.2 Pre-filtering
<- rowSums(counts(dds)>= 10) >= 3
keep
<- dds[keep,] dds
2.2.1 Re-leveling Factors
$dex <- factor(dds$dex, levels = c("untrt","trt"))
dds$dex <- relevel(dds$dex, ref = "untrt")
dds$dex <- droplevels(dds$dex) dds
2.3 Quality Control of Samples
We can see how the variance is stabilized with the rlog transformation.
<- rlog(dds)
rld
<-data.frame(Means=rowMeans(assay(rld)),
hex_dfSds=rowSds(assay(rld)))
<-hex_df |>
gghexggplot(aes(Means,Sds))+
geom_hex(alpha=0.8,bins=40)+
guides(fill=guide_colourbar(title = "Counts"))+
labs(x="Sorted mean of normalized counts per gene",
y="Standard deviation of normalized counts per gene")+
theme_minimal()+
geom_smooth(aes(Means,Sds),colour="red",linewidth=0.5) +
::scale_fill_paletteer_c("ggthemes::Green-Gold")
paletteer
::card(full_screen = T, bslib::card_title("Stabilized Variance"), plotly::ggplotly(gghex)) bslib
Stabilized Variance
Also, is important to check if appears some structure in the sample to sample distances plot.
<-estimateSizeFactors(dds)
dds<- order(rowMeans(counts(dds,normalized=TRUE)),
select decreasing=TRUE)[1:20]
<- dist(t(assay(rld)))
sampleDists
<- as.matrix(sampleDists)
sampleDistMatrix
<-heatmaply(sampleDistMatrix,
ggheatrow_side_colors = colData(rld)[,c("dex","cell")],
row_dend_left = F,colors = viridis::magma(n=256, alpha = 0.8, begin = 0))
::card(full_screen = T, bslib::card_title("Sample-Sample Distance"), ggheat) bslib
Sample-Sample Distance
2.4 Differential Gene Expression (DGE)
Let’s calculate differential expressed genes using shrinked log2 fold changes with threshold of 1 and a FDR of 0.05.
<- DESeq(dds) dds
<- results(dds, lfcThreshold = 1, alpha = 0.05, test = "Wald")
res
<- lfcShrink(dds, type = "apeglm", lfcThreshold = 1, coef = 5) res.lfc
We can see the MA plots as a general view of up and down regulated genes.
<-plotMA(res.lfc,alpha=0.05,returnData=T)
a
$symbol<-rownames(a)
a$label<-NA
a$label[a$isDE == TRUE] <- a$symbol[a$isDE == TRUE]
a
<- ggplot(a, aes(log10(mean), lfc, colour = isDE)) +
p geom_point(alpha = 0.2, size = 0.8) +
geom_hline(aes(yintercept = 0), col = "red", alpha = 0.6) +
geom_rug() +
geom_hline(yintercept = c(-1, 1), color = "midnightblue") +
scale_colour_manual(
name = paste0("FDR = ", 0.05),
values = c(mnsl("10PB 5/8"), mnsl("5BG 7/8")),
labels = c("no DE", "DE")
+
) labs(
y = "Shrunken log2foldchange",
x = "Mean of normalized counts (log10 scale)",
title = "Bland-Altman Plot"
)
p
2.4.1 Top 20 Expressed Genes
As a general view, we can select the top 20 expressed genes ordered by the number of counts.
<-heatmaply::heatmaply(assay(rld)[select,],
dfcol_side_colors=colData(rld)[,c("dex","cell")],
colors = viridis::magma(n=256, alpha = 0.8, begin = 0))
::card(df, full_screen = T) bslib
2.5 Manipulating Annotation with a SQLite DB
For this part of the analysis, is important to use annotation databases, in this case we are gonna use the human h19 genome annotation from the Carlson (2023) package.
<-keys(org.Hs.eg.db)
k
<- AnnotationDbi::select(
a
org.Hs.eg.db,keys = k,
keytype = "ENTREZID",
columns = c("SYMBOL", "GENENAME")
)
<- data.frame(
anno_df gene_id = rownames(dds),
gene_name = mapIds(
org.Hs.eg.db,keys = rownames(dds),
column = "SYMBOL",
keytype = "ENSEMBL"
),gene_entrez = mapIds(
org.Hs.eg.db,keys = rownames(dds),
keytype = "ENSEMBL",
column = "ENTREZID"
),stringsAsFactors = FALSE,
row.names = rownames(dds)
)
$SYMBOL <-
res.lfc$gene_name[match(rownames(res.lfc), anno_df$gene_id)] anno_df
2.6 Construction of Ranks for fgsea
<-
res.lfc_for_ranks %>%
res.lfcas.data.frame()%>%
::select(SYMBOL,log2FoldChange)%>%
dplyrna.omit()%>%
distinct()%>%
group_by(SYMBOL)%>%
summarize(log2FoldChange=mean(log2FoldChange))
<-
res.lfc_ranks deframe(res.lfc_for_ranks)
2.7 Data Manipulation for GeneTonic
For this part of the process, the Wu et al. (2021) package will be used to make the gene sets enrichment analysis. Is important to mention that for this analysis we are gonna analyze only the biological process ontology,
<-bitr(names(res.lfc_ranks),fromType = "SYMBOL",toType = "ENTREZID",OrgDb = org.Hs.eg.db) ranks_entrez
<-res.lfc_ranks[ranks_entrez$SYMBOL]
entrez_res.lfc_ranks
# names(res.lfc_ranks)<-ranks_entrez$ENTREZID
names(entrez_res.lfc_ranks) <- ranks_entrez$ENTREZID
<-
res.lfc_fgsea_GOclu_BP gseGO(
geneList = sort(entrez_res.lfc_ranks, decreasing = T),
OrgDb = org.Hs.eg.db,
ont = "BP",
pvalueCutoff = 1,
verbose = F
)
<-
res.lfc_fgsea_GOclu_BP setReadable(res.lfc_fgsea_GOclu_BP,
"org.Hs.eg.db",
keyType = "ENTREZID")
<-
res.lfc_fgsea_GOclu_BP_shaked shake_gsenrichResult(res.lfc_fgsea_GOclu_BP)
<-
res.lfc_fgsea_GOclu_BP_shaked_aggr get_aggrscores(
res_enrich = res.lfc_fgsea_GOclu_BP_shaked,
res_de = res.lfc,
annotation_obj = anno_df,
aggrfun = mean
)
<- GeneTonic_list(
gtl_res_GOclu_BP dds = dds,
res_de = res,
res_enrich = res.lfc_fgsea_GOclu_BP_shaked_aggr,
annotation_obj = anno_df
)
2.7.1 Summary Heatmap of Genesets
<-gs_summary_heat(gtl = gtl_res_GOclu_BP,
pn_gs=15)
p
2.7.2 GeneSets - Genes Graph
GeneTonic
package have a lot of interactive plots that we can use to dig into our data. An example is a network of Gene sets and genes from which we are gonna perform a network analysis.
# number_gs <- gtl_res_GOclu_BP$res_enrich$gs_id |> unique() |> length()
<- ggs_graph(gtl = gtl_res_GOclu_BP, n_gs = 500)
ggs <- ggs_graph(gtl = gtl_res_GOclu_BP, n_gs = 20)
ggs2
<- toVisNetworkData(ggs)
data <- toVisNetworkData(ggs2)
data2
$nodes <- data2$nodes %>%
data2mutate(group = nodetype, nodetype = NULL) %>%
::select(-color)
dplyr
<- visNetwork(nodes = data2$nodes, edges = data2$edges) %>%
p visIgraphLayout() %>%
visOptions(
highlightNearest = list(
enabled = TRUE,
degree = 1,
hover = TRUE
),nodesIdSelection = TRUE
%>%
) visPhysics(stabilization = FALSE) %>%
visGroups(groupname = "GeneSet", color = "#48ABC7") %>%
visGroups(groupname = "Feature", color = "#8BE64AD0")
::tagList(list(p)) htmltools
2.8 igraph
Object Manipulation
In this part of the analysis, we are gonna make a network analysis with Hagberg, Schult, and Swart (2008) networkx library from python. But, before that we need to manipulate the data to have a proper input.
Let’s build a bipartite undirected adjacency matrix.
\[ A = \begin{pmatrix} 0_{r,r} & B\\ B^{T} & 0_{s,s} \end{pmatrix} \]
V(ggs)$group <- data$nodes$nodetype
V(ggs)$type_bipartite <- as.integer(V(ggs)$nodetype == "Feature")
<- as.matrix(as_adjacency_matrix(ggs, sparse = FALSE)) |> as.data.frame() adj_matrix_bipartite
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
from matplotlib import pyplot as plt
Checking the degree distribution of the graph.
= r.adj_matrix_bipartite
graph
= nx.from_pandas_adjacency(graph, create_using=nx.Graph)
G
= sorted((i[1] for i in G.degree), reverse=True)
degree_sequence
sns.kdeplot(degree_sequence)
sns.rugplot(degree_sequence) plt.show()
In this plot, we can see that there are two sub graphs with more than 120 connected components. We are gonna work with these.
2.9 Gene Sets Similarity
Gene sets similarity can be thought of as the similarity between gene sets based on their shared genes. The strength of the similarity can be measured by the number of genes shared between gene sets.
From a matrix \(G_{mxn}\), with m gene sets and n genes, when can compute the gene set similarity matrix \(S_{mxm}\) (where each element \(s_{ij}\) represents the number of of shared genes between gene sets i and j) as followed:
\[ S = G*G^T \]
= r.ontology_coupling
graph
= nx.from_pandas_adjacency(graph, create_using=nx.Graph)
G G.remove_edges_from(nx.selfloop_edges(G))
= nx.degree_centrality(G)
degree_centrality ='degree_centrality')
nx.set_node_attributes(G, degree_centrality, name
= nx.betweenness_centrality(G, normalized=True)
betweenness_centrality ='betweenness_centrality')
nx.set_node_attributes(G, betweenness_centrality, name
= nx.eigenvector_centrality(G, max_iter=1000000)
eigenvector_centrality ='eigenvector_centrality')
nx.set_node_attributes(G, eigenvector_centrality, name
def graph_to_dataframe(graph):
= {node: graph.nodes[node] for node in graph.nodes()}
node_data = pd.DataFrame.from_dict(node_data, orient='index')
df return df
= graph_to_dataframe(G)
df_genesets_centralities
= nx.spring_layout(G) pos
="degree_centrality"),"Degree Centrality") draw(G, pos, nx.get_node_attributes(G,name
="betweenness_centrality"),"Betweenness Centrality") draw(G, pos, nx.get_node_attributes(G,name
="eigenvector_centrality"),"Eigenvector Centrality") draw(G, pos, nx.get_node_attributes(G,name
<- py$df_genesets_centralities
df_c
<- df_c |>
df_c arrange(desc(betweenness_centrality))
::tagList(list(DT::datatable(df_c, extensions = 'Buttons', options = list(
htmltoolsdom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel')
))))
We can sort the table by eigenvector centrality to see which gene sets are the most important in the network
<- OlsSearch("detoxification of inorganic compound", exact = T, ontology = "GO")
query <- olsSearch(query)
query
<- as(query,"Terms")
qterms
termOntology(qterms)
GO:0061687
"go"
termNamespace(qterms)
GO:0061687
"biological_process"
termLabel(qterms)
GO:0061687
"detoxification of inorganic compound"
termDesc(qterms)
GO:0061687
"Any process that reduces or removes the toxicity of inorganic compounds. These include transport of such compounds away from sensitive areas and to compartments or complexes whose purpose is sequestration of inorganic compounds."
2.10 Gene Co-involvement
Gene co-involvement can be interpreted as the co-involvement of genes in multiple gene sets. The strength of co-involvement can be measured by the number of gene sets in which two genes are both involved.
From a matrix \(G_{mxn}\), with m gene sets and n genes, when can compute the gene set similarity matrix \(C_{mxm}\) (where each element \(c_{ij}\) represents the number of of shared genes between gene sets i and j) as followed:
\[ C = G^T*G \]
<- t(adj_matrix_ontology_genes) %*% adj_matrix_ontology_genes
genes_coapprearence
<- genes_coapprearence |> as.data.frame() genes_coapprearence
= r.genes_coapprearence
graph
= nx.from_pandas_adjacency(graph, create_using=nx.Graph)
G G.remove_edges_from(nx.selfloop_edges(G))
= nx.degree_centrality(G)
degree_centrality ='degree_centrality')
nx.set_node_attributes(G, degree_centrality, name
= nx.betweenness_centrality(G, normalized=True)
betweenness_centrality ='betweenness_centrality')
nx.set_node_attributes(G, betweenness_centrality, name
= nx.eigenvector_centrality(G, max_iter=1000000)
eigenvector_centrality ='eigenvector_centrality')
nx.set_node_attributes(G, eigenvector_centrality, name
def graph_to_dataframe(graph):
= {node: graph.nodes[node] for node in graph.nodes()}
node_data = pd.DataFrame.from_dict(node_data, orient='index')
df return df
= graph_to_dataframe(G)
df_genes_centralities
= nx.spring_layout(G) pos
="degree_centrality"),"Degree Centrality") draw(G, pos, nx.get_node_attributes(G,name
="betweenness_centrality"),"Betweenness Centrality") draw(G, pos, nx.get_node_attributes(G,name
="eigenvector_centrality"),"Eigenvector Centrality") draw(G, pos, nx.get_node_attributes(G,name
<- py$df_genes_centralities
df_d
<- df_d |>
df_d arrange(desc(betweenness_centrality))
::tagList(list(DT::datatable(df_d, extensions = 'Buttons', options = list(
htmltoolsdom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel')
))))
We can sort the table by betweenness centrality to look for genes that serve as bridges or connectors within the network, facilitating the flow of information.
<- res.lfc[which(res.lfc$SYMBOL=="LEP"),] |> rownames()
query
which(rownames(res)==query),] res[
log2 fold change (MLE): dex trt vs untrt
Wald test p-value: dex trt vs untrt
DataFrame with 1 row and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000174697 116.076 3.02938 0.326588 6.21389 5.16892e-10
padj
<numeric>
ENSG00000174697 8.66499e-08
3 Conclusion
- Differential gene expression analysis was performed on airway dataset.
- Gene sets enrichment analysis was performed on airway dataset.
- Network analysis was performed on airway dataset.
References
Citation
@online{palacios bernuy2024,
author = {Palacios Bernuy, Piero},
title = {Analysis of {RNA-Seq} {Data}},
date = {2024-03-07},
langid = {en},
abstract = {This document is part of a series of the analysis of Omics
data. Especifically, here is showed how to analyze bulk RNA-Seq data
with Bioconductor packages. Also, it’s showcased how to make plots
of the RNA data in the context of differentially gene expression and
gene-sets.}
}