From e87acb1e29c4989937b120d883c29273186f28b7 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Wed, 6 Apr 2022 15:31:14 +0200
Subject: [PATCH] Continue Kathi's analysis; add some MEME scans

---
 01_snv_analysis/01_snv_analysis.Rmd        | 144 +++++++++++-------
 02_nextflow_clipseq/02_nextflow_clipseq.sh |   4 +-
 .../01_dreme_motifs.sh                     |  84 ++++++++++
 README.md                                  |   5 +
 4 files changed, 183 insertions(+), 54 deletions(-)
 create mode 100644 03_postprocessing_snv_analysis/01_dreme_motifs.sh

diff --git a/01_snv_analysis/01_snv_analysis.Rmd b/01_snv_analysis/01_snv_analysis.Rmd
index c472e49..a03e189 100644
--- a/01_snv_analysis/01_snv_analysis.Rmd
+++ b/01_snv_analysis/01_snv_analysis.Rmd
@@ -18,6 +18,7 @@ knitr::opts_chunk$set(autodep = TRUE, cache = FALSE,
 
 ### Load packages
 
+
 ```{r, message = FALSE}
 ## library(here)
 library(Rsamtools)
@@ -32,6 +33,7 @@ library(ggpubr)
 library(stringr)
 library(RColorBrewer)
 library(GenomicAlignments)
+library(memes)
 ```
 
 ```{r}
@@ -61,49 +63,56 @@ genes <- gtf[gtf$type =="gene"]
 
 We generate genomewide pileup tables for each bam file. The pileup is used to identify the location and frequency of mutations (including deletions).
 
 ```{r pileup-mut-table}
-pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples)
-min_allel_depth <- 1 ## at least 1 read with deletion per sample
-
-for(s in samples){
-    # sbp <- ScanBamParam(which = mp[[s]])
-    ## require at least 3 reads with any mutation per position
-    ## show all mutations with at least 1 supporting read
-    pp <- PileupParam(max_depth = 30000, include_deletions = TRUE,
-                      include_insertions=FALSE,
-                      min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth)
-    ## compute pileup for both of the replicates
-
-    bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]),
-                       yieldSize=5e4,
-                       index = file.path(WD, "BAM_deduplicated_clip5",
-                                         paste0(bams[[s]][1], ".bai"))))
-
-    pu <- list("rep1" = data.frame(), "rep2" = data.frame())
-    repeat {
-        res <- pileup(bf)
-        if(nrow(res) == 0L)
-            break
-        res <- res %>% dplyr::filter(nucleotide == "-")
-        pu[["rep1"]] <- rbind(pu[["rep1"]], res)
-    }
-    close(bf)
-
-    bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]),
-                       yieldSize=5e4,
-                       index = file.path(WD, "BAM_deduplicated_clip5",
-                                         paste0(bams[[s]][2], ".bai"))))
-
-    repeat {
-        res <- pileup(bf)
-        if(nrow(res) == 0L)
-            break
-        res <- res %>% dplyr::filter(nucleotide == "-")
-        pu[["rep2"]] <- rbind(pu[["rep2"]], res)
-    }
-    close(bf)
-    lapply(pu, dim)
-    pu_res[[s]] <- pu
+if (!file.exists('pu_res.rds')) {
+    pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples)
+    min_allel_depth <- 1 ## at least 1 read with deletion per sample
+
+    for(s in samples){
+        # sbp <- ScanBamParam(which = mp[[s]])
+        ## require at least 3 reads with any mutation per position
+        ## show all mutations with at least 1 supporting read
+        pp <- PileupParam(max_depth = 30000, include_deletions = TRUE,
+                          include_insertions=FALSE,
+                          min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth)
+        ## compute pileup for both of the replicates
+
+        bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]),
+                           yieldSize=5e4,
+                           index = file.path(WD, "BAM_deduplicated_clip5",
+                                             paste0(bams[[s]][1], ".bai"))))
+
+        pu <- list("rep1" = data.frame(), "rep2" = data.frame())
+        repeat {
+            res <- pileup(bf)
+            if(nrow(res) == 0L)
+                break
+            res <- res %>% dplyr::filter(nucleotide == "-")
+            pu[["rep1"]] <- rbind(pu[["rep1"]], res)
+        }
+        close(bf)
+
+        bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]),
+                           yieldSize=5e4,
+                           index = file.path(WD, "BAM_deduplicated_clip5",
+                                             paste0(bams[[s]][2], ".bai"))))
+
+        repeat {
+            res <- pileup(bf)
+            if(nrow(res) == 0L)
+                break
+            res <- res %>% dplyr::filter(nucleotide == "-")
+            pu[["rep2"]] <- rbind(pu[["rep2"]], res)
+        }
+        close(bf)
+        lapply(pu, dim)
+
+        pu_res[[s]] <- pu
+    }
+
+    saveRDS(pu_res, 'pu_res.rds')
+} else {
+    pu_res <- readRDS('pu_res.rds')
 }
 ```
@@ -283,7 +292,7 @@ lapply(clus_an, function(x) x$gene_id %>% unique %>% length)
 
 We filter out protein coding genes and spliced lncRNAs where all clusters overlap with small RNAs, because we saw in IGV that all these examples show no evidence of actual binding to the long genes.
 
-@todo update Izaskun, what about the snoRNAs, scaRNAs, snRNAs etc?
+
 
 ```{r identify-wrong-pc-target-genes}
 to_filter <- lapply(names(clus_an), function(s) {
@@ -323,7 +332,7 @@ to_filter <- lapply(names(clus_an), function(s) {
         }
         return(NULL)
     })
-    unique(unlist(res)) ## all genes that are no true targets
+    print(unique(unlist(res))) ## all genes that are no true targets
 })
 names(to_filter) <- names(clus_an)
 lengths(to_filter)
@@ -404,6 +413,9 @@ write.table(gdf, "gene_deletion_cluster_count.txt", quote = FALSE,
             sep = "\t", row.names = FALSE)
 ```
 
+```{r}
+table(gdf$gene_type)
+```
 
 ## Venn diagram
 
@@ -467,25 +479,41 @@ Most unique genes have only one cluster and some two clusters.
 
 We center a window on the deletion clusters and compute the oligomer enrichment.
 
+
+
+```{r addutrinfotogtf}
+
+txdb1 <- makeTxDbFromGRanges(gtf)
+utr3 <- threeUTRsByTranscript(txdb1, use.names=TRUE)
+
+```
+
 ```{r prepare-annotation}
 # genes <- mp_an[["RBDm"]]$gene_id
 prep_an <- function(gtf, genes){
     g <- gtf[gtf$gene_id %in% genes]
     exon <- g[g$type == "exon"] %>% unique
-    five_utr <- g[g$type == "five_prime_utr"] %>% unique
-    three_utr <- g[g$type == "three_prime_utr"] %>% unique
+    ## utr <- g[g$type == "UTR"] %>% unique
+
+    ## anno <- GRangesList(exon = exon_unique, utr = utr)  # disabled: exon_unique and utr are not defined at this point
+
+    ## intron annotation
+    txdb <- makeTxDbFromGRanges(g)
+    introns <- unlist(intronsByTranscript(txdb))
+    five_utr <- unlist(fiveUTRsByTranscript(txdb))
+    three_utr <- unlist(threeUTRsByTranscript(txdb))
     ## We remove all 3' and 5' UTR regions that overlap with any exons
     exon_utr <- GenomicRanges::setdiff(exon, three_utr)
     exon_unique <- GenomicRanges::setdiff(exon_utr, five_utr) %>% unique
+
     # We remove 3'UTR regions that overlap with 5'UTR regions
     three_utr_unique <- GenomicRanges::setdiff(three_utr, five_utr) %>% unique
+
     anno <- GRangesList(exon = exon_unique, three_prime_utr = three_utr_unique,
-                        five_prime_utr = five_utr)
-    ## intron annotation
-    txdb <- makeTxDbFromGRanges(g)
-    introns <- unlist(intronsByTranscript(txdb))
+                        five_prime_utr = five_utr)
+
     ## remove the intronic parts that overlap with exons from other transcripts
     anno[["intron"]] <- GenomicRanges::setdiff(introns, c(anno[["exon"]],
                                                           anno[["three_prime_utr"]],
@@ -493,7 +521,9 @@ prep_an <- function(gtf, genes){
     ## reduce potentially overlapping ranges
     lapply(anno, reduce)
 }
+```
+```{r}
 sample_an <- lapply(names(clus_an), function(n)
     prep_an(gtf, clus_an[[n]]$gene_id %>% unique))
 names(sample_an) <- names(clus_an)
 
@@ -754,6 +784,7 @@ for(i in 1:length(xparam)){
 
 
 
+
 # Peaks split by location
 
 We separate the protein coding and spliced lncRNAs from the noncoding RNAs and compare the hexamer enrichment in introns, exons and UTRs.
@@ -763,6 +794,7 @@ We focus on the WT sample, because we first want to discover the known GU-repeat
 ## we get the gene IDs for each of the subsets and prepare gene annotations for each of the sets.
 ## separate protein coding and long lncRNAs from the rest
 g <- gtf[gtf$gene_id %in% clus_an[["WT"]]$gene_id & gtf$type == "gene"]
+length(g)
 pc <- g[g$gene_type == "protein_coding"]$gene_id
 length(pc)
 ## separate spliced from unspliced lncRNAs
@@ -794,9 +826,6 @@ sample_an_nc <- sample_an_nc[sapply(sample_an_nc, function(x) length(x) > 0)]
 ```
 
 
-```{r}
-knitr::knit_exit()
-```
 
 ## window 61
 
@@ -1045,6 +1074,7 @@ svg("gene_type.svg"); p; dev.off()
 ```
 
 Chi-square goodness of fit test to compare observed to expected (WT) distribution.
+
 ```{r type-chi-square-test}
 ## 6M vs. WT prop
 (counts <- table(gene_sets[["6M"]]$gene_type_category))
@@ -1118,10 +1148,18 @@ ggplot(df, aes(x = gene_set, y = gene_length)) +
 ```
 
 We test for significant differences between the distributions with pairwise Mann–Whitney U tests and correct for multiple testing.
+
 ```{r test-sign-gene-length}
 pairwise.wilcox.test(df$gene_length, df$gene_set, p.adjust.method = "BH")
 ```
 
+
+```{r}
+
+knitr::knit_exit()
+```
+
+Peaks come from clip5_mp_analysis.Rmd; these run on clipper IDR-harmonized outputs.
 
 # Comparison with peaks
 
diff --git a/02_nextflow_clipseq/02_nextflow_clipseq.sh b/02_nextflow_clipseq/02_nextflow_clipseq.sh
index 4161826..9f6d717 100644
--- a/02_nextflow_clipseq/02_nextflow_clipseq.sh
+++ b/02_nextflow_clipseq/02_nextflow_clipseq.sh
@@ -54,4 +54,6 @@ nice -n 19 nextflow run --max_cpus 50 \
      -profile singularity \
      --peakcaller paraclu,piranha \
      -resume \
-     --outdir out
+     --outdir out \
+     --motif True
+
diff --git a/03_postprocessing_snv_analysis/01_dreme_motifs.sh b/03_postprocessing_snv_analysis/01_dreme_motifs.sh
new file mode 100644
index 0000000..2a99776
--- /dev/null
+++ b/03_postprocessing_snv_analysis/01_dreme_motifs.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+##
+## De novo motif scans (DREME/STREME) on windows around deletion positions and deletion clusters
+##
+
+DATA=/home/imallona/src/polymenidou_manu_clip/01_snv_analysis
+BEDTOOLS=/home/imallona/soft/bedtools/bedtools-2.29.2/bin/bedtools
+WD=/home/imallona/src/polymenidou_manu_clip/03_postprocessing_snv_analysis
+# motif_sample=2000
+fai=hg38.genome
+fasta=~/polymenidou_manu_clip/reference/GRCh38.primary_assembly.genome.fa
+DREME=~/soft/meme/bin/dreme
+STREME=~/soft/meme/bin/streme
+DUST=~/soft/meme/bin/dust ## to remove homopolymers
+
+cd $WD
+mkdir -p meme
+cd $_
+
+mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
+      "select chrom, size from hg38.chromInfo" > hg38.genome
+
+
+for fn in $(find $DATA -name "deletion_pos_min_3reads*.bed")
+do
+    curr=$(basename $fn .bed)
+    motif_sample=$(wc -l $fn | cut -f1 -d" ")
+
+    # if [ "$motif_sample" -gt "3000" ]; then
+    #     motif_sample=3000
+    # fi
+
+    motif_sample=2000
+
+
+    egrep "^chr[0-9XY]" $fn | \
+        $BEDTOOLS slop -s -l 10 -r 10 -i /dev/stdin -g $fai | \
+        shuf -n $motif_sample > "$curr"_peaks.bed
+
+    $BEDTOOLS getfasta -s -fi $fasta -bed "$curr"_peaks.bed -fo "$curr"_peaks.fasta
+
+    $DUST "$curr"_peaks.fasta > "$curr"_dusted.fasta
+
+    $DREME -norc -o dreme_"$curr" -p "$curr"_dusted.fasta
+
+    $STREME --p "$curr"_dusted.fasta \
+        -o streme_"$curr" \
+        --minw 3 \
+        --maxw 30
+
+done
+
+
+# now for the clusters
+
+
+for fn in $(find $DATA -name "deletion_cluster_dist_10_min_3reads_*.bed")
+do
+    curr=$(basename $fn .bed)
+    motif_sample=$(wc -l $fn | cut -f1 -d" ")
+
+    # if [ "$motif_sample" -gt "3000" ]; then
+    #     motif_sample=3000
+    # fi
+
+    motif_sample=1000
+
+
+    egrep "^chr[0-9XY]" $fn | \
+        $BEDTOOLS slop -s -l 10 -r 10 -i /dev/stdin -g $fai | \
+        shuf -n $motif_sample > "$curr"_peaks.bed
+
+    $BEDTOOLS getfasta -s -fi $fasta -bed "$curr"_peaks.bed -fo "$curr"_peaks.fasta
+
+    $DUST "$curr"_peaks.fasta > "$curr"_dusted.fasta
+
+    $DREME -norc -o dreme_"$curr" -p "$curr"_dusted.fasta
+
+    $STREME --p "$curr"_dusted.fasta \
+        -o streme_"$curr" \
+        --minw 3 \
+        --maxw 30
+
+done
diff --git a/README.md b/README.md
index f02c503..901dd13 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,11 @@ The snakefile activates and deactivates (?)
 - `01_snv_analysis`, largely adapted from K. Hembach's crosslinks, postprocesses `00`
 - `02_nextflow_clipseq`, nfcore clipseq from scratch
+
+## Important note about the nextflow clipseq
+
+The deduplication step was removed manually by editing the nf files; the pipeline crashed with the original files when run with `--deduplicate False`. A copy of the edited nf files is available at `nextflow_edited`.
+
 ## Trace
 
 Izaskun Mallona
 
-- 
GitLab
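As a companion to the new `01_dreme_motifs.sh` and the `library(memes)` import added above, a minimal sketch, outside the patch itself, of how an equivalent DREME scan could be driven from R with the memes package. It assumes a local MEME suite installation, and `del_clusters` plus BSgenome.Hsapiens.UCSC.hg38 are hypothetical stand-ins for the deletion clusters and the GRCh38 reference; extra arguments to `runDreme()` are forwarded to the DREME command line, so `norc = TRUE` mirrors the script's `-norc` flag.

```r
library(memes)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)

genome <- BSgenome.Hsapiens.UCSC.hg38
## +/- 10 nt around each cluster, mirroring the bedtools slop call in the script
windows <- resize(del_clusters, width = width(del_clusters) + 20, fix = "center")
seqs <- get_sequence(windows, genome)

## de novo motif discovery against shuffled input, given strand only
dreme_res <- runDreme(seqs, control = "shuffle", norc = TRUE)
head(dreme_res)
```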