## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 5,
  fig.height = 5
)


## ----setup--------------------------------------------------------------------
library(COTAN)
library(data.table)
library(Matrix)
library(ggrepel)
library(factoextra)
library(Rtsne)
library(utils)
library(plotly)
library(tidyverse)
library(htmlwidgets)
library(MASS)
library(dendextend)

## -----------------------------------------------------------------------------
mycolours <- c("A" = "#8491B4B2","B"="#E64B35FF")
my_theme = theme(axis.text.x = element_text(size = 14, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
                 axis.text.y = element_text( size = 14, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
                 axis.title.x = element_text( size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
                 axis.title.y = element_text( size = 14, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"))
data_dir = tempdir()

## ----eval=FALSE, include=FALSE------------------------------------------------
#  Download the dataset for mouse cortex E17.5.
#  if (!file.exists(paste("inst/extdata/","E175_only_cortical_cells.txt.gz", sep = "/"))) {
#    download.file("ftp://ftp.ncbi.nlm.nih.gov/geo/
#                  samples/GSM2861nnn/GSM2861514/suppl/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz",
#                paste(data_dir,"E175_only_cortical_cells.txt.gz", sep = "/"),method = "wget", quiet = FALSE)
#  
#  }

## -----------------------------------------------------------------------------
data("ERCCraw", package = "COTAN")

ERCCraw = as.data.frame(ERCCraw)
rownames(ERCCraw) = ERCCraw$V1
ERCCraw = ERCCraw[,2:ncol(ERCCraw)]
ERCCraw[1:5,1:5]

## -----------------------------------------------------------------------------
out_dir <- paste0(tempdir(),"/")

## -----------------------------------------------------------------------------
obj = new("scCOTAN",raw = ERCCraw)
#obj = initRaw(obj,GEO="GSM2861514" ,sc.method="Drop_seq",cond = "mouse cortex E17.5")
obj = initRaw(obj,GEO="ERCC" ,sc.method="10X",cond = "negative ERCC dataset")

## -----------------------------------------------------------------------------
genes_to_rem = get.genes(obj)[grep('^mt', get.genes(obj))] 
cells_to_rem = names(get.cell.size(obj)[which(get.cell.size(obj) == 0)])
obj = drop.genes.cells(obj,genes_to_rem,cells_to_rem )

## -----------------------------------------------------------------------------
#t = "E17.5_cortex"
t = "ERCC"

print(paste("Condition ",t,sep = ""))
#--------------------------------------
n_cells = length(get.cell.size(object = obj))
print(paste("n cells", n_cells, sep = " "))

n_it = 1

## -----------------------------------------------------------------------------
if(!file.exists(out_dir)){
  dir.create(file.path(out_dir))
}

if(!file.exists(paste(out_dir,"cleaning", sep = ""))){   
  dir.create(file.path(out_dir, "cleaning"))
}

## -----------------------------------------------------------------------------
ttm = clean(obj)

obj = ttm$object

ttm$pca.cell.2

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
#  ttm$pca.cell.2
#  ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
#    geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
#                    nudge_y      = 0.05,
#                    nudge_x      = 0.05,
#                    direction    = "x",
#                    angle        = 90,
#                    vjust        = 0,
#                    segment.size = 0.2)+
#    ggtitle("B cell group genes mean expression")+my_theme +
#    theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 5,hjust = 0.02 ))
#  dev.off()
#  
#  if (length(ttm$cl1) < length(ttm$cl2)) {
#    to_rem = ttm$cl1
#  }else{
#    to_rem = ttm$cl2
#  }
#  n_it = n_it+1
#  obj = drop.genes.cells(object = obj,genes = c(),cells = to_rem)
#  
#  gc()
#  
#  ttm = clean(obj)
#  #ttm = clean.sqrt(obj, cells)
#  obj = ttm$object
#  
#  ttm$pca.cell.2
#  

## -----------------------------------------------------------------------------
pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle(label = "B cell group genes mean expression", subtitle = " - B group NOT removed -")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 10,hjust = 0.02 ),
        plot.subtitle = element_text(color = "darkred",vjust = - 15,hjust = 0.01 ))

dev.off()

## -----------------------------------------------------------------------------
nu_est = round(get.nu(object = obj), digits = 7)

plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))

plot.nu = plot.nu + geom_point(size = 1,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored.pdf", sep = "")))
plot.nu
dev.off()

plot.nu

## -----------------------------------------------------------------------------
nu_df = data.frame("nu"= sort(get.nu(obj)), "n"=c(1:length(get.nu(obj))))

ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1)+
    my_theme #+ ylim(0,1) + xlim(0,70)


## -----------------------------------------------------------------------------
yset = 0.4#threshold to remove low UDE cells
plot.ude <- ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1) + 
    my_theme + ylim(0,1) + xlim(0,400) +
    geom_hline(yintercept=yset, linetype="dashed", color = "darkred") +
    annotate(geom="text", x=200, y=0.25, 
             label=paste("to remove cells with nu < ",yset,sep = " "), 
             color="darkred", size=4.5)


pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
plot.ude
dev.off()

plot.ude

## -----------------------------------------------------------------------------
obj = add.row.to.meta(obj,c("Threshold low UDE cells:",yset)) 

to_rem = rownames(nu_df[which(nu_df$nu < yset),])

obj = drop.genes.cells(object = obj, genes = c(),cells =  to_rem)

## -----------------------------------------------------------------------------
ttm = clean(obj)
obj = ttm$object
ttm$pca.cell.2

## -----------------------------------------------------------------------------
pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
ggplot(nu_df, aes(x = n, y=nu)) + geom_point(colour = "#8491B4B2", size=1) +my_theme + #xlim(0,100)+
  annotate(geom="text", x=50, y=0.25, label="nothing to remove ", color="darkred")
dev.off()

## -----------------------------------------------------------------------------
nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 2,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency: last") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored_FINAL.pdf", sep = "")))
plot.nu
dev.off()

plot.nu

## -----------------------------------------------------------------------------
obj = cotan_analysis(obj,cores = 2)

## -----------------------------------------------------------------------------
obj = get.coex(obj)

# saving the structure 
saveRDS(obj,file = paste(out_dir,t,".cotan.RDS", sep = ""))

## -----------------------------------------------------------------------------
plot_GDI(obj, cond = "ERCC")

## -----------------------------------------------------------------------------
quant.p = get.GDI(obj)

head(quant.p)

## ----echo=TRUE----------------------------------------------------------------
AA=c("ERCC-00012","ERCC-00013","ERCC-00014") 
BB=c("ERCC-00016","ERCC-00017","ERCC-00019")
CC=c("ERCC-00022","ERCC-00024","ERCC-00028")

text.size = 12

quant.p$highlight = with(quant.p, ifelse(rownames(quant.p) %in% AA, "AA",
                                                 ifelse(rownames(quant.p) %in% CC,"Constitutive" ,
                                                               ifelse(rownames(quant.p) %in% BB,"BB" , "normal"))))

textdf <- quant.p[rownames(quant.p) %in% c(AA,CC,BB), ]
mycolours <- c("Con" = "#00A087FF","AA"="#E64B35FF","BB"="#F39B7FFF","normal" = "#8491B4B2")
f1 = ggplot(subset(quant.p,highlight == "normal" ), aes(x=sum.raw.norm, y=GDI)) +  geom_point(alpha = 0.1, color = "#8491B4B2", size=2.5)
GDI_plot = f1 +  geom_point(data = subset(quant.p,highlight != "normal"  ), aes(x=sum.raw.norm, y=GDI, colour=highlight),size=2.5,alpha = 0.8) +
  geom_hline(yintercept=quantile(quant.p$GDI)[4], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=quantile(quant.p$GDI)[3], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=1.5, linetype="dotted", color = "red", size= 0.5) +
  scale_color_manual("Status", values = mycolours)  +
  scale_fill_manual("Status", values = mycolours)  +
  xlab("log normalized counts")+ylab("GDI")+
  geom_label_repel(data = textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf),fill=highlight),
                   label.size = NA, 
                   alpha = 0.5, 
                   direction ="both",
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data =textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf)),
                   label.size = NA, 
                   segment.color = 'black',
                   segment.size = 0.5,
                   direction = "both",
                   alpha = 0.8, 
                   na.rm=TRUE,
                   fill = NA,
                   seed = 1234) +
  theme(axis.text.x = element_text(size = text.size, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
        axis.text.y = element_text( size = text.size, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),  
        axis.title.x = element_text( size = text.size, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
        axis.title.y = element_text( size = text.size, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"),
        legend.title = element_blank(),
        legend.text = element_text(color = "#3C5488FF",face ="italic" ),
        legend.position = "bottom") 
legend <- cowplot::get_legend(GDI_plot)
GDI_plot =GDI_plot + theme(
        legend.position = "none") 
GDI_plot


## -----------------------------------------------------------------------------
list.genes = list("Ref.col"= BB, "AA"=AA, "Const."=CC )

plot_heatmap(df_genes = list.genes,sets = c(1:3),conditions = "ERCC",dir = out_dir)

## -----------------------------------------------------------------------------
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"), condition = "ERCC",dir = out_dir, p_value = 0.05)

## -----------------------------------------------------------------------------
plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"),markers.list =c("ERCC-00084" ,"ERCC-00085" ,"ERCC-00086") ,symmetric = FALSE, condition = "ERCC",dir = out_dir, p_value = 0.05)

## -----------------------------------------------------------------------------
 get.observed.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")

## -----------------------------------------------------------------------------
get.expected.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")

## -----------------------------------------------------------------------------
# For the whole matrix
coex <- extract.coex(object = obj)
coex[1:5,1:5]

## -----------------------------------------------------------------------------
# For a partial matrix
coex <- extract.coex(object = obj,genes = c("ERCC-00017", "ERCC-00019", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031"))
head(coex)

## -----------------------------------------------------------------------------
if (file.exists(paste(out_dir,t,".cotan.RDS", sep = ""))) {
  #Delete file if it exists
  file.remove(paste(out_dir,t,".cotan.RDS", sep = ""))
}
unlink(paste(out_dir,"cleaning", sep = ""),recursive = TRUE)

print(sessionInfo())