---
title: "CLiP postprocessing - deletion/SNV based"
author: "Katharina Hembach, updated by Izaskun Mallona"
date: "16.08.2021"
output:
html_document:
toc: true
toc_float: true
---
```{r, echo = FALSE, warning = FALSE, message = FALSE}
knitr::opts_chunk$set(autodep = TRUE, cache = FALSE,
cache.lazy = FALSE,
dev = "png",
dev.args = list(png = list(type = "cairo")),
warning = FALSE, message = FALSE)
```
### Load packages
```{r load-packages}
## library(here)
library(Rsamtools)
library(rtracklayer)
library(dplyr)
library(BSgenome.Hsapiens.UCSC.hg38)
library(eulerr)
library(ggplot2)
library(GenomicFeatures)
library(ggrepel)
library(ggpubr)
library(stringr)
library(RColorBrewer)
library(GenomicAlignments)
```
```{r}
WD <- file.path('/home', 'imallona', 'polymenidou_manu_clip/', 'armor_output')
```
# Load data
```{r load-files}
samples <- c("WT", "6M", "RBDm")
bams <- lapply(samples, function(x)
list.files(file.path(WD, "BAM_deduplicated_clip5"),
pattern = paste0("20200123.A-", x, "2*_R2_deduplicated.bam$")))
names(bams) <- samples
## reference genome
genome <- BSgenome.Hsapiens.UCSC.hg38
seqlevelsStyle(genome) <- 'UCSC' ## "Ensembl"
gtf_file <- file.path(WD, "..", "reference", "gencode.v39.annotation.gtf")
gtf <- rtracklayer::import(gtf_file)  ## GRanges with the gene/exon/UTR records used throughout
genes <- gtf[gtf$type == "gene"]
```
# Compute pileup
We generate genome-wide pileup tables for each BAM file. The pileup is used to identify the location and frequency of mutations (including deletions).
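As a reference for the columns used below, here is a minimal sketch (not evaluated) of what `Rsamtools::pileup()` returns; the BAM path is a placeholder, not one of the project files.

```{r pileup-sketch, eval = FALSE}
## pileup() returns one row per position/strand/nucleotide with the columns
## seqnames, pos, strand, nucleotide and count; deletions show up as nucleotide "-"
library(Rsamtools)
toy_bam <- "example.bam"  ## placeholder; assumed coordinate-sorted and indexed
pp <- PileupParam(max_depth = 30000, include_deletions = TRUE, include_insertions = FALSE,
                  min_nucleotide_depth = 1)
head(pileup(BamFile(toy_bam), pileupParam = pp))
## keeping only rows with nucleotide == "-" gives the per-position deletion counts used below
```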
```{r pileup-mut-table}
if (!file.exists('pu_res.rds')) {
pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples)
min_allel_depth <- 1 ## at least 1 read with deletion per sample
for(s in samples){
# sbp <- ScanBamParam(which = mp[[s]])
## report every position with at least 1 read supporting a mutation (deletion);
## stricter replicate-based filtering is applied downstream
pp <- PileupParam(max_depth = 30000, include_deletions = TRUE,
include_insertions=FALSE,
min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth)
## compute pileup for both of the replicates
bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]),
yieldSize=5e4,
index = file.path(WD, "BAM_deduplicated_clip5",
paste0(bams[[s]][1], ".bai"))))
pu <- list("rep1" = data.frame(), "rep2" = data.frame())
repeat {
res <- pileup(bf)
if(nrow(res) == 0L)
break
res <- res %>% dplyr::filter(nucleotide == "-")
pu[["rep1"]] <- rbind(pu[["rep1"]], res)
}
close(bf)
bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]),
yieldSize=5e4,
index = file.path(WD, "BAM_deduplicated_clip5",
paste0(bams[[s]][2], ".bai"))))
repeat {
res <- pileup(bf)
if(nrow(res) == 0L)
break
res <- res %>% dplyr::filter(nucleotide == "-")
pu[["rep2"]] <- rbind(pu[["rep2"]], res)
}
close(bf)
lapply(pu, dim)
pu_res[[s]] <- pu
}
saveRDS(pu_res, 'pu_res.rds')
} else {
pu_res <- readRDS('pu_res.rds')
}
```
We compute the fraction of reads carrying a deletion at each position and remove positions with >50% deletions; these are either true deletions relative to the reference genome or positions with very low read coverage.
```{r read-genome-coverage}
## import BAM files to get genome read coverage
if (!file.exists( "clip5_BAM_coverage.rds")) {
fs <- dir(path = file.path(WD, "BAM_deduplicated_clip5"), pattern = "_R2_deduplicated.bam$",
full.names = TRUE)
names(fs) <- str_extract(basename(fs),
"(?<=20200123.A-).*(?=_R2_deduplicated.bam)")
cov <- list()
for (sample in names(fs)){
ga <- readGAlignments(fs[sample])
cov[[sample]] <- coverage(ga)
rm(ga)
}
gc()
  saveRDS(cov, "clip5_BAM_coverage.rds")
} else {
  cov <- readRDS("clip5_BAM_coverage.rds")
}
```
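The per-position read counts added in the next chunk are obtained by indexing the coverage `RleList` with single-base `GRanges`; a small toy illustration (made-up values, not the project coverage):

```{r coverage-lookup-toy}
## indexing an RleList coverage object with single-base ranges returns the read depth
## at each queried position (here 3 and 5)
toy_cov <- RleList(chr1 = Rle(c(0L, 3L, 5L, 5L, 2L)))
toy_pos <- GRanges("chr1", IRanges(start = c(2, 4), width = 1))
toy_cov[toy_pos] %>% unlist %>% as.vector
```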
```{r add-del-perc}
lapply(pu_res, function(x) lapply(x, nrow))
pu_res_cov <- list()
for(i in names(pu_res)){
m <- match(i, names(cov))
a <- pu_res[[i]]
a[["rep1"]]$nreads <- cov[[m]][GRanges(seqnames = a[["rep1"]]$seqnames,
range = IRanges(start = a[["rep1"]]$pos,
end = a[["rep1"]]$pos),
strand = a[["rep1"]]$strand)] %>%
unlist %>% as.vector
a[["rep1"]]$del_frac <- a[["rep1"]]$count / a[["rep1"]]$nreads
a[["rep2"]]$nreads <- cov[[m+1]][GRanges(seqnames = a[["rep2"]]$seqnames,
range = IRanges(start = a[["rep2"]]$pos,
end = a[["rep2"]]$pos),
strand = a[["rep2"]]$strand)] %>%
unlist %>% as.vector
a[["rep2"]]$del_frac <- a[["rep2"]]$count / a[["rep2"]]$nreads
pu_res_cov[[i]] <- a
}
lapply(pu_res_cov, function(x) lapply(x, function(y) summary(y$del_frac)))
## fraction of positions with > 50% deletions
lapply(pu_res_cov, function(x) lapply(x, function(y) sum(y$del_frac > 0.5)/nrow(y)))
## what is the number of reads at these positions?
lapply(pu_res_cov, function(x) lapply(x, function(y) y[y$del_frac > 0.5, "nreads"] %>% summary))
## remove all positions with > 50% deletions
pu_res_cov <- lapply(pu_res_cov, function(x) lapply(x, function(y) y[y$del_frac <= 0.5,]))
lapply(pu_res_cov, function(x) lapply(x, nrow))
```
```{r deletion-pos}
## check the number of reads supporting each deletion in the two replicates and filter positions
## that only have deletions in one replicate but not the other
## lapply(pu_res, nrow)
pu_res_cov[["RBDm"]][["rep1"]] %>% head
pu_res_cov[["RBDm"]][["rep2"]] %>% head
pu_merge <- lapply(pu_res_cov, function(x){
x[["rep1"]] %>%
full_join(x[["rep2"]], by = c("seqnames", "pos", "strand", "nucleotide"),
suffix = c("1", "2"))
})
lapply(pu_merge, nrow)
for(s in names(pu_merge)){
print(s)
print(paste0("number of positions: ", nrow(pu_merge[[s]])))
## positions with deletions in both replicates
print(paste0("deletion in both reps: ",
pu_merge[[s]] %>%
dplyr::filter(!is.na(count1) & !is.na(count2)) %>% nrow))
## positions with deletions supported by at least 2 reads in one of the replicates
print(paste(">=2 reads in one rep: ",
pu_merge[[s]] %>% dplyr::filter((count1 > 1 & !is.na(count2)) |
(!is.na(count1) & count2 > 1)) %>% nrow))
## positions with deletions supported by at least 2 reads in both replicates
print(paste0(">=2 reads in both reps: ",
pu_merge[[s]] %>% dplyr::filter(count1 > 1 & count2 > 1) %>% nrow))
## positions with at least 4 deletions independent of replicates
print(paste0(">=4 reads: ",
pu_merge[[s]] %>% dplyr::filter(count1 + count2 >= 4) %>% nrow))
}
```
## Filter common deletion sites
```{r filter-mut-pos}
## we keep the common deletion positions with at least 2 reads in one replicate
pu_filtered <- lapply(pu_merge, function(x)
x %>% dplyr::filter((count1 > 1 & !is.na(count2)) |
(!is.na(count1) & count2 > 1)))
lapply(pu_filtered, nrow)
## sum of reads supporting a deletion
lapply(pu_filtered, function(x) summary(x$count1 + x$count2))
```
## Positions with a high number of deletions
What are the positions with many deletions? Are they common between samples?
```{r pos-high-del-count}
## positions with at least 25 reads (summed over both replicates) supporting the deletion
lapply(pu_filtered, function(x) x[rowSums(x[,c("count1", "count2")]) >= 25,])
```
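A sketch (not evaluated) for checking whether these high-count positions recur across samples, joining the per-sample tables on position; `high` is a helper name introduced here for illustration only:

```{r high-del-shared-sketch, eval = FALSE}
## positions with >= 25 supporting reads (summed over both replicates) per sample,
## then the intersection across samples by seqnames/pos/strand
high <- lapply(pu_filtered, function(x)
    x[rowSums(x[, c("count1", "count2")], na.rm = TRUE) >= 25,
      c("seqnames", "pos", "strand")])
Reduce(function(a, b) dplyr::inner_join(a, b, by = c("seqnames", "pos", "strand")), high)
```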
# Cluster deletions
We cluster the mutations that are at most 10 bp apart.
```{r group-mutations}
## create GRanges with deletion positions
gr <- lapply(pu_filtered, function(x)
GRanges(seqnames = x$seqnames, range = IRanges(start = x$pos, end = x$pos),
strand = x$strand, count1 = x$count1, count2 = x$count2))
## save deletions positions
lapply(names(gr), function(x)
export(gr[[x]], paste0("deletion_pos_min_3reads_", x, ".bed")))
## merge all mutations which are at max 10 bp apart
max_gap <- 10
del_cluster <- lapply(gr, function(x) reduce(x, min.gapwidth=max_gap + 1))
lengths(del_cluster)
lapply(names(del_cluster), function(x)
export(del_cluster[[x]], paste0("deletion_cluster_dist_", max_gap,
"_min_3reads_", x, ".bed")))
```
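The `min.gapwidth = max_gap + 1` convention means that two deletion positions separated by up to `max_gap` bases are merged into the same cluster; a toy check with made-up coordinates:

```{r cluster-gap-toy}
## positions 100 and 108 (7 bases between them) are merged into one cluster;
## position 130 is more than 10 bases away from 108 and stays on its own
toy <- GRanges("chr1", IRanges(start = c(100, 108, 130), width = 1), strand = "+")
reduce(toy, min.gapwidth = 10 + 1)
```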
# Annotate clusters
We annotate each deletion cluster with the overlapping gene(s).
```{r annotate-clusters, warning = FALSE, eval = TRUE}
## kathi: remove deletion sites on chromosome patches
del_cluster <- lapply(del_cluster, function(x)
x[seqnames(x) %in% seqnames(genome)])
del_cluster <- lapply(del_cluster, function(x) {
seqlevels(x) <- seqlevelsInUse(x)
x})
## each cluster gets a number as ID
del_cluster <- lapply(del_cluster, function(x){
x$ID <- 1:length(x)
x
})
clus_an <- lapply(names(del_cluster), function(x) {
olap <- findOverlaps(del_cluster[[x]], genes, ignore.strand = FALSE)
data.frame(cluster_id = del_cluster[[x]][queryHits(olap)]$ID,
gene_id = genes[subjectHits(olap)]$gene_id)
})
names(clus_an) <- names(del_cluster)
lapply(clus_an, nrow)
## number of target genes
lapply(clus_an, function(x) x$gene_id %>% unique %>% length)
```
## Filter target genes based on cluster location
We filter out protein-coding genes and spliced lncRNAs where all clusters overlap with small RNAs, because inspection in IGV showed that these examples have no evidence of actual binding to the long genes.
```{r identify-wrong-pc-target-genes}
to_filter <- lapply(names(clus_an), function(s) {
x <- clus_an[[s]]
g <- genes[genes$gene_id %in% x$gene_id]
## separate protein coding and long lncRNAs from short genes
pc <- g[g$gene_type == "protein_coding"]$gene_id
lnc <- g[g$gene_type == "lncRNA"]$gene_id
g1 <- gtf[gtf$gene_id %in% lnc]
e <- g1[g1$type == "exon"]
sp <- split(e$exon_number, e$gene_id)
## all lncRNAs with more than 1 exon
lnc_spl <- names(sp)[lengths(lapply(sp, unique)) > 1]
## all spliced genes that need to be checked for potential false positives
pc <- c(pc, g[g$gene_id %in% lnc_spl]$gene_id)
## list of small RNAs that might cause the clusters in the long genes
## ## except TEC, MT_rRNA,
## everything except the protein coding genes and pseudogenes
short <- g[!g$gene_type %in% c("protein_coding", "processed_pseudogene",
"unprocessed_pseudogene", "ribozyme",
"rRNA_pseudogene", "Mt_rRNA",
"transcribed_processed_pseudogene",
"transcribed_unitary_pseudogene",
"transcribed_unprocessed_pseudogene",
"unitary_pseudogene",
"unprocessed_pseudogene", "TEC")]
## remove the long lncRNAs
short <- short[!short$gene_id %in% pc]
## go through the list of genes and determine if all clusters overlap small RNAs
res <- lapply(pc, function(i) {
clusters <- x[x$gene_id ==i, "cluster_id"]
nr_olap <- countOverlaps(del_cluster[[s]][del_cluster[[s]]$ID %in% clusters], short,
ignore.strand = FALSE)
if(all(nr_olap > 0)){ ## all clusters overlap at least one short RNA
return(i)
}
return(NULL)
})
print(unique(unlist(res))) ## all genes that are not true targets
})
names(to_filter) <- names(clus_an)
lengths(to_filter)
```
```{r filter-target-genes}
# what are the filtered genes?
lapply(to_filter, function(x){
g <- genes[genes$gene_id %in% x]
m <- match(x, g$gene_id)
data.frame(gene_id = x, gene_name = g[m]$gene_name,
gene_type = g[m]$gene_type )
})
lapply(to_filter, function(x) genes[genes$gene_id %in% x]$gene_name)
## remove the wrong targets from the list
nam <- names(clus_an)
clus_an <- lapply(names(clus_an), function(x) {
clus_an[[x]] %>% dplyr::filter(!gene_id %in% to_filter[[x]])
})
names(clus_an) <- nam
## number of target genes after filtering
lapply(clus_an, function(x) {
length(unique(x$gene_id))})
saveRDS(clus_an, "del_clus_an.rds")
```
# Target genes
We need a table with all target genes and the number of clusters per gene.
```{r gdf}
## data.frame with all target genes
gdf <- as.data.frame(genes[genes$gene_id %in%
(lapply(clus_an, function(x) x$gene_id %>% unique) %>% unlist %>% unique)]) %>%
dplyr::select(seqnames, start, end, strand, gene_id, gene_name, gene_type)
## add the number of clusters per group
for(i in names(clus_an)){
clus_count <- clus_an[[i]] %>%
dplyr::group_by(gene_id) %>%
dplyr::summarise(n = n())
m <- match(gdf$gene_id, clus_count$gene_id)
cname <- paste0("nclusters_", i)
gdf[,cname] <- clus_count$n[m]
gdf[is.na(gdf[,cname]), cname] <- 0
}
## sort according to number of clusters
gdf <- gdf %>% dplyr::arrange(desc(nclusters_WT))
gdf %>% head
gdf %>% dplyr::arrange(desc(nclusters_6M)) %>% head
gdf %>% dplyr::arrange(desc(nclusters_RBDm)) %>% head
## distribution of gene types
gdf <- gdf %>% dplyr::filter(!gene_type %in%
c("processed_pseudogene", "rRNA_pseudogene",
"transcribed_processed_pseudogene", "transcribed_unitary_pseudogene",
"transcribed_unprocessed_pseudogene", "unitary_pseudogene",
"unprocessed_pseudogene", "TEC"))
nrow(gdf)
## genes with 6M peaks but 0 WT peak
gdf %>% dplyr::arrange(desc(nclusters_6M)) %>%
dplyr::filter(nclusters_WT == 0) %>%
head
## bound in WT and RBDm, but not 6M
gdf %>% dplyr::filter(nclusters_6M == 0 & nclusters_WT > 0 & nclusters_RBDm > 0) %>%
head
write.table(gdf, "gene_deletion_cluster_count.txt",
quote = FALSE, sep = "\t", row.names = FALSE)
```
```{r}
table(gdf$gene_type)
```
## Venn diagram
Venn and Euler diagrams of the genes with at least one cluster.
```{r venn-diagram}
gene_list <- list("WT" = gdf$gene_id[gdf$nclusters_WT>0],
"6M" = gdf$gene_id[gdf$nclusters_6M>0],
"RBDm" = gdf$gene_id[gdf$nclusters_RBDm>0])
euler_diag <- euler(gene_list, shape = "circle")
venn_diag <- venn(gene_list)
eulerr_options(labels = list(fontsize = 20),
quantities = list(fontsize = 20, font = 2),
fills = list(alpha = 0.5),
padding = unit(0.6, "lines"))
p <- plot(euler_diag, font=1,
fills=c("#117733", "#882255", "steelblue3"),
edges=c("#117733", "#882255", "steelblue3"),
labels = list(col = c("#117733", "#882255", "steelblue4")),
quantities = TRUE,
alpha=0.6, lwd = 4, adjust_labels = FALSE)
p
svg("euler.svg"); p; dev.off()
p <- plot(venn_diag,font=1,
fills=c("#117733", "#882255", "steelblue3"),
edges=c("#117733", "#882255", "steelblue3"),
labels = list(col = c("#117733", "#882255", "steelblue4")),
quantities = TRUE,
alpha=0.6, lwd = 4, adjust_labels = FALSE)
p
svg("venn.svg"); p; dev.off()
```
What are the genes with clusters in only the 6M or RBDm samples?
```{r unique_clusters}
## 6M specific
gdf %>% dplyr::filter(nclusters_WT == 0 & nclusters_RBDm == 0) %>% dplyr::pull(gene_name)
## RBDm specific
gdf %>% dplyr::filter(nclusters_WT == 0 & nclusters_6M == 0) %>% dplyr::pull(gene_name)
```
What is the mean number of clusters in the unique and shared target genes?
```{r}
gdf %>% dplyr::filter(nclusters_WT == 0 & nclusters_RBDm == 0) %>%
dplyr::pull(nclusters_6M) %>% summary
gdf %>% dplyr::filter(nclusters_WT == 0 & nclusters_6M == 0) %>%
dplyr::pull(nclusters_RBDm) %>% summary
## How does it compare to the overall number of mutation clusters per gene?
gdf %>% dplyr::filter(nclusters_6M > 0 ) %>% dplyr::pull(nclusters_6M) %>% summary
gdf %>% dplyr::filter(nclusters_RBDm > 0 ) %>% dplyr::pull(nclusters_RBDm) %>% summary
```
Most unique genes have only one cluster, and some have two.
# Hexamer enrichment in windows surrounding deletion clusters
We center a window on each deletion cluster and compute the oligomer enrichment relative to a shuffled background.
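For each oligomer $k$, the enrichment score computed further down is a z-score comparing the observed count $o_k$ (summed over all cluster-centered windows) against counts $o^{\mathrm{shuf}}_k$ from 100 rounds of windows placed at random positions within the overlapping annotation regions:

$$z_k = \frac{o_k - \overline{o^{\mathrm{shuf}}_k}}{\mathrm{sd}\left(o^{\mathrm{shuf}}_k\right)}$$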
```{r addutrinfotogtf}
txdb1 <- makeTxDbFromGRanges(gtf)
utr3 <- threeUTRsByTranscript(txdb1, use.names=TRUE)
```
```{r prepare-annotation}
# genes <- mp_an[["RBDm"]]$gene_id
prep_an <- function(gtf, genes){
g <- gtf[gtf$gene_id %in% genes]
exon <- g[g$type == "exon"] %>% unique
## utr <- g[g$type == "UTR"] %>% unique
## anno <- GRangesList(exon = exon_unique, utr = utr) ## superseded by the assignment below
## intron annotation
txdb <- makeTxDbFromGRanges(g)
introns <- unlist(intronsByTranscript(txdb))
five_utr <- unlist(fiveUTRsByTranscript(txdb))
three_utr <- unlist(threeUTRsByTranscript(txdb))
## We remove the 3' and 5' UTR parts from the exon regions
exon_utr <- GenomicRanges::setdiff(exon, three_utr)
exon_unique <- GenomicRanges::setdiff(exon_utr, five_utr) %>% unique
# We remove 3'UTR regions that overlap with 5'UTR regions
three_utr_unique <- GenomicRanges::setdiff(three_utr, five_utr) %>% unique
anno <- GRangesList(exon = exon_unique, three_prime_utr = three_utr_unique,
five_prime_utr = five_utr)
## remove the intronic parts that overlap with exons from other transcripts
anno[["intron"]] <- GenomicRanges::setdiff(introns, c(anno[["exon"]],
anno[["three_prime_utr"]],
anno[["five_prime_utr"]])) %>% reduce
## reduce potentially overlapping ranges
lapply(anno, reduce)
}
sample_an <- lapply(names(clus_an), function(n)
prep_an(gtf, clus_an[[n]]$gene_id %>% unique))
names(sample_an) <- names(clus_an)
lapply(sample_an, function(x) lapply(x, length))
saveRDS(sample_an, "sample_an_deletion_clusters.rds")
```
```{r z-score-function-def}
## for each sample
## we place the window at a random position within each bg seq
## we compute the oligomer counts
## we repeat this 100 times, to generate a matrix with oligomer x 100 (oligomer count for each repetition)
shuffled_oligomer_counts <- function(b, w_size, nolig = 6){
## only keep the region that are at least as big as w_size
b <- b[lengths(b) >= w_size]
names(b) <- 1:length(b)
# random number between 0 and 1 is multiplied with sequence length to determine the start position of the shuffled window
starts <- floor(1 + runif(length(b)) * (lengths(b)-w_size + 1))
# subset the sequences according to random start coordinates
seq <- subseq(b, start = starts, end = starts + w_size - 1)
of <- oligonucleotideFrequency(seq, width = nolig, step = 1,
simplify.as = "collapsed")
}
## given window size, peaks with position of peak center and overlapping annotation regions,
## we can compute the oligomer occurrence in the peak windows
## and we shuffle the windows in the annotation regions 100 times and
## also count the oligomers to compute the z-score per oligomer
oligomer_zscore <- function(wind_size = 41, clusters, sample_an, nolig = 6){
half_wind <- (wind_size-1)/2
## Peak window regions
wind <- lapply(clusters, function(x)
GRanges(seqnames(x), IRanges(start(x)-half_wind, end(x)+half_wind),
strand = strand(x), ID = x$ID))
## Annotation regions overlapping with at least half of a window
sample_an_wind <- lapply(names(sample_an), function(x)
lapply(sample_an[[x]], function(a) {
a[queryHits(findOverlaps(a, wind[[x]], minoverlap = half_wind+1))]
})
)
names(sample_an_wind) <- names(sample_an)
## genomic sequence of windows and annotation regions
wind_seq <- lapply(wind, function(x) getSeq(genome, x))
bg_seq <- lapply(sample_an_wind, function(x)
GRangesList(x) %>%
unlist %>%
getSeq(x = genome, names = .))
## oligomer count in window
obs <- lapply(wind_seq, function(x) {
oligonucleotideFrequency(x, width = nolig, step = 1,
simplify.as = "collapsed")
})
## oligomer count in shuffled windows
obs_sh <- lapply(names(bg_seq), function(x) {
obs_sh <- list()
for(i in 1:100){
obs_sh[[i]] <- shuffled_oligomer_counts(b = bg_seq[[x]],
w_size = wind_size, nolig)
}
obs_sh <- bind_cols(obs_sh)
obs_sh
})
names(obs_sh) <- names(bg_seq)
## z-score
sh_params <- lapply(obs_sh, function(x) {
data.frame(mean = rowMeans(x), sd = apply(x, 1, sd))
})
z_scores <- lapply(names(obs), function(x) {
(obs[[x]] - sh_params[[x]]$mean) / sh_params[[x]]$sd
})
names(z_scores) <- names(obs)
z_scores
}
## Plot the z-scores of two peak sets against each other
plot_oligomer <- function(dfz, xparam, yparam, i, label = "oligomer"){
lim <- c(min(dfz[, xparam[i]], dfz[, yparam[i]]), max(dfz[, xparam[i]], dfz[, yparam[i]]))
p <- ggplot(dfz, aes_string(x = xparam[i], y = yparam[i], label = label)) +
geom_point(alpha = 0.3, col = "darkblue") +
theme_bw() + xlim(lim) + ylim(lim) + theme(aspect.ratio = 1) +
stat_cor(method = "pearson", label.x.npc = "center", label.y.npc = "top") +
geom_smooth(method="lm", se = TRUE, color = "darkgrey") +
geom_text_repel(data = rbind(dfz %>%
dplyr::arrange(desc(get(xparam[i]))) %>%
dplyr::slice(1:20),
dfz %>%
dplyr::arrange(desc(get(yparam[i]))) %>%
dplyr::slice(1:20)) %>%
unique)
print(p)
}
```
# Z-score for different oligomer and window sizes
## Pentamer & window 61
What are the pentamers with the highest z-score per sample?
```{r wind-61, warning = FALSE, message = FALSE}
z_scores <- oligomer_zscore(wind_size = 61, del_cluster, sample_an, nolig = 5)
lapply(z_scores, summary)
lapply(z_scores, function(x) x[order(x)][1:20])
lapply(z_scores, function(x) x[order(x, decreasing = TRUE)][1:30])
lapply(z_scores, function(x) x["GTGTG"])
lapply(z_scores, function(x) x["TGTGT"])
```
Are the top pentamers similar between samples?
### Scatterplot {.tabset}
```{r z-score-plots-61, results = "asis", message = FALSE}
dfz <- data.frame(pentamer = names(z_scores[[1]]),
zscore_WT = z_scores[["WT"]],
zscore_6M = z_scores[["6M"]],
zscore_RBDm = z_scores[["RBDm"]])
xparam <- c("zscore_6M", "zscore_6M", "zscore_RBDm")
yparam <- c("zscore_WT", "zscore_RBDm", "zscore_WT")
for(i in 1:length(xparam)){
cat("#### ", xparam[i], " vs. ", yparam[i], "\n")
plot_oligomer(dfz, xparam, yparam, i, label = "pentamer")
cat("\n\n")
}
```
## Hexamer & window 41
What are the hexamers with the highest z-score per sample?
```{r hexamer-wind-41, warning = FALSE, message = FALSE}
z_scores <- oligomer_zscore(wind_size = 41, del_cluster, sample_an, nolig = 6)
lapply(z_scores, summary)
lapply(z_scores, function(x) x[order(x)][1:20])
lapply(z_scores, function(x) x[order(x, decreasing = TRUE)][1:30])
lapply(z_scores, function(x) x["GTGTGT"])
lapply(z_scores, function(x) x["TGTGTG"])
```
Are the top hexamers similar between samples?
### Scatterplot {.tabset}
```{r hexamer-z-score-plots-41, results = "asis", message = FALSE}
dfz <- data.frame(hexamer = names(z_scores[[1]]),
zscore_WT = z_scores[["WT"]],
zscore_6M = z_scores[["6M"]],
zscore_RBDm = z_scores[["RBDm"]])
xparam <- c("zscore_6M", "zscore_6M", "zscore_RBDm")
yparam <- c("zscore_WT", "zscore_RBDm", "zscore_WT")
for(i in 1:length(xparam)){
cat("#### ", xparam[i], " vs. ", yparam[i], "\n")
plot_oligomer(dfz, xparam, yparam, i, label = "hexamer")
cat("\n\n")
}
```
### Histogram and violin plot
Distribution of hexamer z-scores per sample, with the hexamer counts in the cluster windows added for reference.
```{r histogram-violin-plot}
wind_size <- 41
half_wind <- (wind_size-1)/2
nolig <- 6
## Peak window regions
wind <- lapply(del_cluster, function(x)
GRanges(seqnames(x), IRanges(start(x)-half_wind, end(x)+half_wind),
strand = strand(x), ID = x$ID))
wind_seq <- lapply(wind, function(x) getSeq(genome, x))
## oligomer count in window
obs <- lapply(wind_seq, function(x) {
oligonucleotideFrequency(x, width = nolig, step = 1,
simplify.as = "collapsed")
})
## add hexamer counts to table
dfh <- dfz %>% tidyr::pivot_longer(-hexamer, names_to = "sample",
names_prefix = "zscore_",
values_to = "zscore") %>%
dplyr::left_join(data.frame(hexamer = c(names(obs[["WT"]]),
names(obs[["6M"]]),
names(obs[["RBDm"]])),
count = c(obs[["WT"]], obs[["6M"]],
obs[["RBDm"]]),
sample = c(rep("WT", length(obs[["WT"]])),
rep("6M", length(obs[["6M"]])),
rep("RBDm", length(obs[["RBDm"]])))),
by = c("hexamer", "sample")) %>%
dplyr::mutate(sample = factor(sample, level = c("WT", "6M", "RBDm")))
dfh[dfh$hexamer =="GTGTGT",]
dfh[dfh$hexamer =="TGTGTG",]
p <- dfh %>% ggplot(aes(x = zscore, color = sample, fill = sample)) +
geom_histogram(bins = 200, alpha = 0.6) +
theme_bw() +
facet_wrap(~sample, ncol = 1, scales = "free")
p2 <- dfh %>% ggplot(aes(x = sample, y = zscore)) +
geom_violin() +
geom_boxplot(width=0.1) +
theme_bw() +
geom_text_repel(data = dfh %>% dplyr::filter(hexamer %in% c("GTGTGT", "TGTGTG")),
aes(label = hexamer),
min.segment.length = 0, segment.color = "grey50",
box.padding = 0.5, xlim = c(1.1, NA)) +
facet_wrap(~sample, nrow = 1, scale = "free_x") +
theme(strip.background = element_blank(), strip.text.x = element_blank())
p
p2
```
## Hexamer & window 61
What are the hexamers with the highest z-score per sample?
```{r hexamer-wind-61, warning = FALSE, message = FALSE}
z_scores <- oligomer_zscore(wind_size = 61, del_cluster, sample_an, nolig = 6)
lapply(z_scores, summary)
lapply(z_scores, function(x) x[order(x)][1:20])
lapply(z_scores, function(x) x[order(x, decreasing = TRUE)][1:30])
lapply(z_scores, function(x) x["GTGTGT"])
lapply(z_scores, function(x) x["TGTGTG"])
```
Are the top hexamers similar between samples?
### Scatterplot {.tabset}
```{r hexamer-z-score-plots-61, results = "asis", message = FALSE}
dfz <- data.frame(hexamer = names(z_scores[[1]]),
zscore_WT = z_scores[["WT"]],
zscore_6M = z_scores[["6M"]],
zscore_RBDm = z_scores[["RBDm"]])
xparam <- c("zscore_6M", "zscore_6M", "zscore_RBDm")
yparam <- c("zscore_WT", "zscore_RBDm", "zscore_WT")
for(i in 1:length(xparam)){
cat("#### ", xparam[i], " vs. ", yparam[i], "\n")
plot_oligomer(dfz, xparam, yparam, i, label = "hexamer")
cat("\n\n")
}
```
# Peaks split by location
We separate the protein-coding genes and spliced lncRNAs from the noncoding RNAs and compare the hexamer enrichment in introns, exons and UTRs.
We focus on the WT sample, because we first want to discover the known GU-repeat motif of intronic targets.
```{r separate-pc-ncRNA}
## we get the gene IDs for each of the subsets and prepare gene annotations for each of the sets.
## separate protein coding and long lncRNAs from the rest
g <- gtf[gtf$gene_id %in% clus_an[["WT"]]$gene_id & gtf$type == "gene"]
pc <- g[g$gene_type == "protein_coding"]$gene_id
length(pc)
## separate spliced from unspliced lncRNAs
lnc <- g[g$gene_type == "lncRNA"]$gene_id
length(lnc)
g1 <- gtf[gtf$gene_id %in% lnc]
e <- g1[g1$type == "exon"]
sp <- split(e$exon_number, e$gene_id)
## all lncRNAs with more than 1 exon
lnc_spl <- names(sp)[lengths(lapply(sp, unique)) > 1]
length(lnc_spl)
## all spliced genes that need to be checked for potential false positives
pc <- c(pc, g[g$gene_id %in% lnc_spl]$gene_id)
length(pc)
## short RNAs
short <- clus_an[["WT"]]$gene_id[!clus_an[["WT"]]$gene_id %in% pc] %>% unique
length(short)
length(short) + length(pc)
length(unique(clus_an[["WT"]]$gene_id))
```
We prepare the annotation for the two gene sets.
```{r prep-ann-split}
sample_an_pc <- prep_an(gtf, pc)
sample_an_nc <- prep_an(gtf, short)
sample_an_nc <- sample_an_nc[sapply(sample_an_nc, function(x) length(x) > 0)]
```
## Window 61
```{r wind-61-pc-nc, warning = FALSE, message = FALSE}
hexamer_zscore_single_set <- function(wind_size = 41, clusters, sample_an){
half_wind <- (wind_size-1)/2
## Cluster window regions
wind <- GRanges(seqnames(clusters),
IRanges(start(clusters)-half_wind, end(clusters)+half_wind),
strand = strand(clusters), ID = clusters$ID)
## Annotation regions overlapping with at least half of a window
sample_an_wind <- lapply(sample_an, function(a) {
a[queryHits(findOverlaps(a, wind, minoverlap = half_wind+1))]
})
## genomic sequence of windows and annotation regions
wind_seq <- getSeq(genome, wind)
bg_seq <- sample_an_wind %>%
GRangesList() %>%
unlist %>%
getSeq(x = genome, names = .)
## hexamer count in window
obs <- oligonucleotideFrequency(wind_seq, width = 6, step = 1,
simplify.as = "collapsed")
## hexamer count in shuffled windows
obs_sh <- list()
for(i in 1:100){
obs_sh[[i]] <- shuffled_oligomer_counts(b = bg_seq,
w_size = wind_size, nolig = 6)
}
obs_sh <- bind_cols(obs_sh)
## z-score
sh_params <- data.frame(mean = rowMeans(obs_sh), sd = apply(obs_sh, 1, sd))
(obs - sh_params$mean) / sh_params$sd
}
# separate pc from short genes
cluster_ids_pc <- clus_an[["WT"]]$cluster_id[clus_an[["WT"]]$gene_id %in% pc]
cluster_ids_nc <- clus_an[["WT"]]$cluster_id[clus_an[["WT"]]$gene_id %in% short]
## protein coding and spliced lncRNAs
z_scores_pc <- hexamer_zscore_single_set(wind_size = 61,
del_cluster[["WT"]][del_cluster[["WT"]]$ID %in% cluster_ids_pc],
sample_an_pc)
summary(z_scores_pc)
z_scores_pc[order(z_scores_pc)][1:20]
z_scores_pc[order(z_scores_pc, decreasing = TRUE)][1:30]
z_scores_pc["GTGTGT"]
z_scores_pc["TGTGTG"]
## ncRNAs
z_scores_nc <- hexamer_zscore_single_set(wind_size = 61,
del_cluster[["WT"]][del_cluster[["WT"]]$ID %in% cluster_ids_nc],
sample_an_nc)
summary(z_scores_nc)
z_scores_nc[order(z_scores_nc)][1:20]
z_scores_nc[order(z_scores_nc, decreasing = TRUE)][1:30]
z_scores_nc["GTGTGT"]
z_scores_nc["TGTGTG"]
```
### Split cluster locations within pc genes
```{r split-pc-peaks-location-61, message = FALSE}
hexamer_zscore_split <- function(wind_size = 41, clusters, sample_an){
half_wind <- (wind_size-1)/2
## cluster window regions
wind <- lapply(clusters, function(x)
GRanges(seqnames(x), IRanges(start(x)-half_wind, end(x)+half_wind),
strand = strand(x), ID = x$ID))
## Annotation regions overlapping with at least half of a window
sample_an_wind <- lapply(names(sample_an), function(x) {
sample_an[[x]][queryHits(findOverlaps(sample_an[[x]],
wind[[x]],
minoverlap = half_wind+1))]
})
names(sample_an_wind) <- names(sample_an)
## genomic sequence of windows and annotation regions
wind_seq <- lapply(wind, function(x) getSeq(genome, x))
bg_seq <- lapply(sample_an_wind, function(x)
GRangesList(x) %>%
unlist %>%
getSeq(x = genome, names = .))
## hexamer count in window
obs <- lapply(wind_seq, function(x) {
oligonucleotideFrequency(x, width = 6, step = 1,
simplify.as = "collapsed")
})
## hexamer count in shuffled windows
obs_sh <- lapply(names(bg_seq), function(x) {
obs_sh <- list()
for(i in 1:100){
obs_sh[[i]] <- shuffled_oligomer_counts(b = bg_seq[[x]],
w_size = wind_size, nolig = 6)
}
obs_sh <- bind_cols(obs_sh)
obs_sh
})
names(obs_sh) <- names(bg_seq)
## z-score
sh_params <- lapply(obs_sh, function(x) {
data.frame(mean = rowMeans(x), sd = apply(x, 1, sd))
})
z_scores <- lapply(names(obs), function(x) {
(obs[[x]] - sh_params[[x]]$mean) / sh_params[[x]]$sd
})
names(z_scores) <- names(obs)
z_scores
}
cluster_split_pc <- lapply(sample_an_pc, function(a){
subsetByOverlaps(del_cluster[["WT"]][del_cluster[["WT"]]$ID %in% cluster_ids_pc],
a)
})
```
```{r, eval = FALSE}
cluster_split_pc <- lapply(cluster_split_pc, function(x) if(length(x)) return(x))
```
```{r}
z_scores_pc_split <- hexamer_zscore_split(wind_size = 61, cluster_split_pc, sample_an_pc)
lapply(z_scores_pc_split, summary)
lapply(z_scores_pc_split, function(x) x[order(x)][1:20])
lapply(z_scores_pc_split, function(x) x[order(x, decreasing = TRUE)][1:30])
lapply(z_scores_pc_split, function(x) x["GTGTGT"])
lapply(z_scores_pc_split, function(x) x["TGTGTG"])
```
#### Scatterplot {.tabset}
```{r z-score-plots-split-pc, results = "asis", message = FALSE}
dfz <- data.frame(hexamer = names(z_scores_pc_split[[1]]),
zscore_exon = z_scores_pc_split[["exon"]],
zscore_3UTR = z_scores_pc_split[["three_prime_utr"]],
zscore_5UTR = z_scores_pc_split[["five_prime_utr"]],
zscore_intron = z_scores_pc_split[["intron"]])
xparam <- c("zscore_exon", "zscore_3UTR", "zscore_5UTR", "zscore_exon", "zscore_exon", "zscore_3UTR")
yparam <- c("zscore_intron", "zscore_intron", "zscore_intron", "zscore_3UTR", "zscore_5UTR", "zscore_5UTR")
for(i in 1:length(xparam)){
cat("##### ", xparam[i], " vs. ", yparam[i], "\n")
plot_oligomer(dfz, xparam, yparam, i, label = "hexamer")
cat("\n\n")
}
```
# Distribution of gene type
```{r gene-sets}
## all target genes of WT, 6M and RBDm
gene_sets <- list(WT = gdf %>% dplyr::filter(nclusters_WT > 0),
"6M" = gdf %>% dplyr::filter(nclusters_6M > 0),
RBDm = gdf %>% dplyr::filter(nclusters_RBDm > 0))
lapply(gene_sets, nrow)
## all target genes unique to WT, shared between WT and 6M
## or shared between WT, 6M and RBDm
gene_sets_wt <- list(WT = gdf %>%
dplyr::filter(nclusters_WT > 0 & nclusters_6M == 0 & nclusters_RBDm == 0),
"6M" = gdf %>%