Identification of key genes and pathways associated with osteoarthritis by single cell transcriptome

The following is the contribution of the apprentice in November

Single cell RNA sequencing of synovial fibroblasts (read.table reads the compressed tsv text file)

The article we reproduce today is a single-cell data mining article published in Medicine magazine in 2020, with the title of identification of the key gene and pathways associated with osteoarthritis via single cell RNA sequencing on synchronous fibroblasts. The link of the article is: https://journals.lww.com/md-journal/Fulltext/2020/08140/Identification_of_the_key_gene_and_pathways.81.aspx

This data set is mainly used to mine GSE109449 data. The main research goal is to explore the key genes and related pathways of bone synovial fibroblasts and osteoarthritis by means of scrna SEQ sequencing. This is reproduced in Figure 4 of this article. As shown below:

image-20210828212537989

The data link is: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109449

The data provided by the author are:

GSM2943158 single fibroblast from donor OA4 (OA4_A1)
GSM2943159 single fibroblast from donor OA4 (OA4_A10)
GSM2943160 single fibroblast from donor OA4 (OA4_A11)
GSM2943161 single fibroblast from donor OA4 (OA4_A12)
GSM2943162 single fibroblast from donor OA4 (OA4_A2)
GSM2943163 single fibroblast from donor OA4 (OA4_A3)
GSM2943164 single fibroblast from donor OA4 (OA4_A4)
GSM2943165 single fibroblast from donor OA4 (OA4_A5)
GSM2943166 single fibroblast from donor OA4 (OA4_A6)

image-20210828214028760

Data source of this article:

192 scRNA-seq files of SFs from 2 OA patients were accessed from microarray GSE109449 included in Gene Expression Omnibus (GEO) atabase (https://www.ncbi.nlm.nih.gov/geo/). Then statistical analysis was conducted with respect to gene number and expression levels.

Now let's begin to reproduce:

Create a folder data in the running environment offline, which contains two files:

image-20210829153150132

Set1 reads the expression matrix and constructs the Seurat object

# Import data to build Seurat objects----------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 
meta <- read.table("../data/GSE109449_singlecell_rnaseq_metadata.tsv.gz",
                   sep = "\t",header = T,row.names = 1)
data <- read.table("../data/GSE109449_singlecell_rnaseq_gene_counts.tsv.gz",
                       sep = "\t",header = T,row.names = 1)
#tsv format special reading method, table can read any
#Form files are distinguished by seq. Ensg 0000000003 is another form of gene expression. gene count files are generally the files we want
#This file should have both gene, Barcode and data
data[1:4, 1:4]
#                    OA4_A1   OA4_A10 OA4_A11 OA4_A12
# ENSG00000000003   2.76277 380.18400 0.00000 1.76691
# ENSG00000000005   0.00000   0.00000 0.00000 0.00000
# ENSG00000000419 271.00000   0.00000 1.01144 0.00000
# ENSG00000000457   3.93664  16.66799 4.71615 2.33886
#
fivenum(data[,1])
# Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input data.
# [1] 0.000000e+00 0.000000e+00 0.000000e+00 9.216565e+00 2.170270e+05

This data is tricky, and the ID needs to be converted:

library(clusterProfiler)
library(org.Hs.eg.db)
ids=bitr(rownames(data),'ENSEMBL','SYMBOL','org.Hs.eg.db') # Convert ENSEMBL to SYMBOL
head(ids)
same_name <- intersect(rownames(data),ids$ENSEMBL)#intersect 

data_f <- data[which(rownames(data) %in% same_name),]#Extracting intersecting data from large data frames
ids_f <- ids[which(ids$ENSEMBL %in% same_name),]#Extracting intersecting data from large data frames
ids_u <- ids_f[!duplicated(ids_f$ENSEMBL),]#ids_f ratio data_f is large, indicating that it needs to be repeated
ids_u <- ids_u[!duplicated(ids_u$SYMBOL),]#Repeat again
data_o <- data_f[rownames(data_f) %in% ids_u$ENSEMBL,]#data_o and data_f the quantity is the same, indicating data_o no duplication
rownames(data_o) <- ids_u$SYMBOL#Change row name
head(data_o)
data=data_o
data<- data[,grep("OA*",colnames(data))]#Intercept the list that begins with OA
data[1:4,1:4]
dim(data)
data['ACTB',] # Check data
data['GAPDH',] # Check data

After finishing the expression matrix, build the Seurat object

#Create object---------------------------------------------------------
sce.all=CreateSeuratObject(counts = data)
sce.all
table(sce.all@meta.data$orig.ident)
dim(sce.all) 
#[1] 24404   192

Set2 Dimensionality Reduction Clustering

sce <- sce.all
#Data preprocessing
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
               )  
# Dimension reduction processing
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# Clustering and clustering
sce <- FindNeighbors(sce, dims = 1:30)
sce
# An object of class Seurat 
# 24404 features across 192 samples within 1 assay 
# Active assay: RNA (24404 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap
res <-  c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)
sce <- FindClusters(sce, graph.name = "RNA_snn", resolution = res, algorithm = 1)
#Cluster tree visualization
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("RNA_snn_res", colnames(sce@meta.data))]) 

p2_tree <- clustree(sce@meta.data, prefix = "RNA_snn_res.") 
p2_tree
# RNA clustering data----
save(sce,file = '../data/first_sce_by_RNA.Rdata')

Set3 differential gene display

pro='../picture/DEG/'
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25, 
                              thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'DEG_sce.markers_by_cluster.csv'))
library(dplyr) 
top10 <- sce.markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)+scale_color_discrete(name = "Identity", na.translate = FALSE)
ggsave(filename=paste0(pro,'DEG_sce.markers_10_heatmap.pdf'))

image-20210829165623379

genes <- c("VCAN","MMP2","DCN","FGF7","CXCL12","CP","IGF1","C4B","C7","C3")
FeaturePlot(sce,features = genes,cols = c("green","red"))
ggsave(filename = paste0(pro,'DEG_sce.markers_10_featureplot.pdf'))

image-20210829170919113

VlnPlot(sce,features = genes)
ggsave(filename = paste0(pro,'DEG_sce.markers_10_VlnPlot.pdf'))

image-20210829171829273

So far, we have completed the drawing of article figure 4 of identification of the key gene and pathways associated with osteoarthritis via single cell RNA sequencing on synovial fibroblasts. Finally, to sum up, I want to give you a word. You have to do it on paper.

Single cell RNA sequencing of synovial fibroblasts (read.table reads the compressed tsv text file)

The article we reproduce today is a single-cell data mining article published in Medicine magazine in 2020, with the title of identification of the key gene and pathways associated with osteoarthritis via single cell RNA sequencing on synchronous fibroblasts. The link of the article is: https://journals.lww.com/md-journal/Fulltext/2020/08140/Identification_of_the_key_gene_and_pathways.81.aspx

This data set is mainly used to mine GSE109449 data. The main research goal is to explore the key genes and related pathways of bone synovial fibroblasts and osteoarthritis by means of scrna SEQ sequencing. This is reproduced in Figure 4 of this article. As shown below:

image-20210828212537989

The data link is: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109449

The data provided by the author are:

GSM2943158 single fibroblast from donor OA4 (OA4_A1)
GSM2943159 single fibroblast from donor OA4 (OA4_A10)
GSM2943160 single fibroblast from donor OA4 (OA4_A11)
GSM2943161 single fibroblast from donor OA4 (OA4_A12)
GSM2943162 single fibroblast from donor OA4 (OA4_A2)
GSM2943163 single fibroblast from donor OA4 (OA4_A3)
GSM2943164 single fibroblast from donor OA4 (OA4_A4)
GSM2943165 single fibroblast from donor OA4 (OA4_A5)
GSM2943166 single fibroblast from donor OA4 (OA4_A6)

image-20210828214028760

Data source of this article:

192 scRNA-seq files of SFs from 2 OA patients were accessed from microarray GSE109449 included in Gene Expression Omnibus (GEO) atabase (https://www.ncbi.nlm.nih.gov/geo/). Then statistical analysis was conducted with respect to gene number and expression levels.

Now let's begin to reproduce:

Create a folder data in the running environment offline, which contains two files:

image-20210829153150132

Set1 reads the expression matrix and constructs the Seurat object

# Import data to build Seurat objects----------------------------------------------------------
rm(list = ls())
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(clustree)) 
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(dplyr)) 
suppressPackageStartupMessages(library(data.table)) 
meta <- read.table("../data/GSE109449_singlecell_rnaseq_metadata.tsv.gz",
                   sep = "\t",header = T,row.names = 1)
data <- read.table("../data/GSE109449_singlecell_rnaseq_gene_counts.tsv.gz",
                       sep = "\t",header = T,row.names = 1)
#tsv format special reading method, table can read any
#Form files are distinguished by seq. Ensg 0000000003 is another form of gene expression. gene count files are generally the files we want
#This file should have both gene, Barcode and data
data[1:4, 1:4]
#                    OA4_A1   OA4_A10 OA4_A11 OA4_A12
# ENSG00000000003   2.76277 380.18400 0.00000 1.76691
# ENSG00000000005   0.00000   0.00000 0.00000 0.00000
# ENSG00000000419 271.00000   0.00000 1.01144 0.00000
# ENSG00000000457   3.93664  16.66799 4.71615 2.33886
#
fivenum(data[,1])
# Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input data.
# [1] 0.000000e+00 0.000000e+00 0.000000e+00 9.216565e+00 2.170270e+05

This data is tricky, and the ID needs to be converted:

library(clusterProfiler)
library(org.Hs.eg.db)
ids=bitr(rownames(data),'ENSEMBL','SYMBOL','org.Hs.eg.db') # Convert ENSEMBL to SYMBOL
head(ids)
same_name <- intersect(rownames(data),ids$ENSEMBL)#intersect 

data_f <- data[which(rownames(data) %in% same_name),]#Extracting intersecting data from large data frames
ids_f <- ids[which(ids$ENSEMBL %in% same_name),]#Extracting intersecting data from large data frames
ids_u <- ids_f[!duplicated(ids_f$ENSEMBL),]#ids_f ratio data_f is large, indicating that it needs to be repeated
ids_u <- ids_u[!duplicated(ids_u$SYMBOL),]#Repeat again
data_o <- data_f[rownames(data_f) %in% ids_u$ENSEMBL,]#data_o and data_f the quantity is the same, indicating data_o no duplication
rownames(data_o) <- ids_u$SYMBOL#Change row name
head(data_o)
data=data_o
data<- data[,grep("OA*",colnames(data))]#Intercept the list that begins with OA
data[1:4,1:4]
dim(data)
data['ACTB',] # Check data
data['GAPDH',] # Check data

After finishing the expression matrix, build the Seurat object

#Create object---------------------------------------------------------
sce.all=CreateSeuratObject(counts = data)
sce.all
table(sce.all@meta.data$orig.ident)
dim(sce.all) 
#[1] 24404   192

Set2 Dimensionality Reduction Clustering

sce <- sce.all
#Data preprocessing
sce <- NormalizeData(sce)
sce = ScaleData(sce, 
                features = rownames(sce),
                # vars.to.regress = c("nFeature_RNA", "percent_mito")
               )  
# Dimension reduction processing
sce <- FindVariableFeatures(sce, nfeatures = 3000)
sce <- RunPCA(sce, npcs = 30)
sce <- RunTSNE(sce, dims = 1:30)
sce <- RunUMAP(sce, dims = 1:30)
# Clustering and clustering
sce <- FindNeighbors(sce, dims = 1:30)
sce
# An object of class Seurat 
# 24404 features across 192 samples within 1 assay 
# Active assay: RNA (24404 features, 3000 variable features)
# 3 dimensional reductions calculated: pca, tsne, umap
res <-  c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)
sce <- FindClusters(sce, graph.name = "RNA_snn", resolution = res, algorithm = 1)
#Cluster tree visualization
colors <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
names(colors) <- unique(colnames(sce@meta.data)[grep("RNA_snn_res", colnames(sce@meta.data))]) 

p2_tree <- clustree(sce@meta.data, prefix = "RNA_snn_res.") 
p2_tree
# RNA clustering data----
save(sce,file = '../data/first_sce_by_RNA.Rdata')

Set3 differential gene display

pro='../picture/DEG/'
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25, 
                              thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'DEG_sce.markers_by_cluster.csv'))
library(dplyr) 
top10 <- sce.markers %>% group_by(cluster) %>% top_n(5, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)+scale_color_discrete(name = "Identity", na.translate = FALSE)
ggsave(filename=paste0(pro,'DEG_sce.markers_10_heatmap.pdf'))

image-20210829165623379

genes <- c("VCAN","MMP2","DCN","FGF7","CXCL12","CP","IGF1","C4B","C7","C3")
FeaturePlot(sce,features = genes,cols = c("green","red"))
ggsave(filename = paste0(pro,'DEG_sce.markers_10_featureplot.pdf'))

image-20210829170919113

VlnPlot(sce,features = genes)
ggsave(filename = paste0(pro,'DEG_sce.markers_10_VlnPlot.pdf'))

image-20210829171829273

So far, we have completed the drawing of article figure 4 of identification of the key gene and pathways associated with osteoarthritis via single cell RNA sequencing on synovial fibroblasts. Finally, to sum up, I want to give you a word. You have to do it on paper.

Posted on Sun, 05 Dec 2021 06:40:45 -0500 by domerdel