diff --git a/01_snv_analysis/01_snv_analysis.Rmd b/01_snv_analysis/01_snv_analysis.Rmd index c472e4907f6fd1925faad92ec6f4f0c8a12f96c5..a03e18988c2f123597fe43f50f83471815303dd8 100644 --- a/01_snv_analysis/01_snv_analysis.Rmd +++ b/01_snv_analysis/01_snv_analysis.Rmd @@ -18,6 +18,7 @@ knitr::opts_chunk$set(autodep = TRUE, cache = FALSE, ### Load packages + ```{r, message = FALSE} ## library(here) library(Rsamtools) @@ -32,6 +33,7 @@ library(ggpubr) library(stringr) library(RColorBrewer) library(GenomicAlignments) +library(memes) ``` ```{r} @@ -61,49 +63,56 @@ genes <- gtf[gtf$type =="gene"] We generate genomewide pileup tables for each bam file. The pileup is used to identify the location and frequency of mutations (including deletions). ```{r pileup-mut-table} -pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples) -min_allel_depth <- 1 ## at least 1 read with deletion per sample - -for(s in samples){ - # sbp <- ScanBamParam(which = mp[[s]]) - ## require at least 3 reads with any mutation per position - ## show all mutations with at least 1 supporting read - pp <- PileupParam(max_depth = 30000, include_deletions = TRUE, - include_insertions=FALSE, - min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth) - ## compute pileup for both of the replicates - - bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]), - yieldSize=5e4, - index = file.path(WD, "BAM_deduplicated_clip5", - paste0(bams[[s]][1], ".bai")))) - - pu <- list("rep1" = data.frame(), "rep2" = data.frame()) - repeat { - res <- pileup(bf) - if(nrow(res) == 0L) - break - res <- res %>% dplyr::filter(nucleotide == "-") - pu[["rep1"]] <- rbind(pu[["rep1"]], res) - } - close(bf) - - bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]), - yieldSize=5e4, - index = file.path(WD, "BAM_deduplicated_clip5", - paste0(bams[[s]][2], ".bai")))) - - repeat { - res <- pileup(bf) - if(nrow(res) == 0L) - break - res <- res %>% dplyr::filter(nucleotide == "-") - pu[["rep2"]] <- rbind(pu[["rep2"]], res) - } - close(bf) - lapply(pu, dim) - pu_res[[s]] <- pu +if (!file.exists('pu_res.rds')) { + pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples) + min_allel_depth <- 1 ## at least 1 read with deletion per sample + + for(s in samples){ + # sbp <- ScanBamParam(which = mp[[s]]) + ## require at least 3 reads with any mutation per position + ## show all mutations with at least 1 supporting read + pp <- PileupParam(max_depth = 30000, include_deletions = TRUE, + include_insertions=FALSE, + min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth) + ## compute pileup for both of the replicates + + bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]), + yieldSize=5e4, + index = file.path(WD, "BAM_deduplicated_clip5", + paste0(bams[[s]][1], ".bai")))) + + pu <- list("rep1" = data.frame(), "rep2" = data.frame()) + repeat { + res <- pileup(bf) + if(nrow(res) == 0L) + break + res <- res %>% dplyr::filter(nucleotide == "-") + pu[["rep1"]] <- rbind(pu[["rep1"]], res) + } + close(bf) + + bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]), + yieldSize=5e4, + index = file.path(WD, "BAM_deduplicated_clip5", + paste0(bams[[s]][2], ".bai")))) + + repeat { + res <- pileup(bf) + if(nrow(res) == 0L) + break + res <- res %>% dplyr::filter(nucleotide == "-") + pu[["rep2"]] <- rbind(pu[["rep2"]], res) + } + close(bf) + lapply(pu, dim) + + pu_res[[s]] <- pu + } + + saveRDS(pu_res, 'pu_res.rds') +} else { + pu_res <- readRDS('pu_res.rds') } ``` @@ -283,7 +292,7 @@ lapply(clus_an, function(x) x$gene_id %>% unique %>% length) We filter out protein coding genes and spliced lncRNAs where all clusters overlap with small RNAs, because we saw in IGV that all these examples show now evidence of actual binding to the long genes. -@todo update Izaskun, what about the snoRNAs, scaRNAs, snRNAs etc? + ```{r identify-wrong-pc-target-genes} to_filter <- lapply(names(clus_an), function(s) { @@ -323,7 +332,7 @@ to_filter <- lapply(names(clus_an), function(s) { } return(NULL) }) - unique(unlist(res)) ## all genes that are no true targets + print(unique(unlist(res))) ## all genes that are no true targets }) names(to_filter) <- names(clus_an) lengths(to_filter) @@ -404,6 +413,9 @@ write.table(gdf, "gene_deletion_cluster_count.txt", quote = FALSE, sep = "\t", row.names = FALSE) ``` +```{r} +table(gdf$gene_type) +``` ## Venn diagram @@ -467,25 +479,41 @@ Most unique genes have only one cluster and some two clusters. We center a window on the deletion clusters and compute the oligomer enrichment. + + +```{r addutrinfotogtf} + +txdb1 <- makeTxDbFromGRanges(gtf) +utr3 <- threeUTRsByTranscript(txdb1, use.names=TRUE) + +``` + ```{r prepare-annotation} # genes <- mp_an[["RBDm"]]$gene_id prep_an <- function(gtf, genes){ g <- gtf[gtf$gene_id %in% genes] exon <- g[g$type == "exon"] %>% unique - five_utr <- g[g$type == "five_prime_utr"] %>% unique - three_utr <- g[g$type == "three_prime_utr"] %>% unique + ## utr <- g[g$type == "UTR"] %>% unique + + anno <- GRangesList(exon = exon_unique, utr = utr) + + ## intron annotation + txdb <- makeTxDbFromGRanges(g) + introns <- unlist(intronsByTranscript(txdb)) + five_utr <- unlist(fiveUTRsByTranscript(txdb)) + three_utr <- unlist(threeUTRsByTranscript(txdb)) ## We remove all 3' and 5' UTR regions that overlap with any exons exon_utr <- GenomicRanges::setdiff(exon, three_utr) exon_unique <- GenomicRanges::setdiff(exon_utr, five_utr) %>% unique + # We remove 3'UTR regions that overlap with 5'UTR regions three_utr_unique <- GenomicRanges::setdiff(three_utr, five_utr) %>% unique + anno <- GRangesList(exon = exon_unique, three_prime_utr = three_utr_unique, - five_prime_utr = five_utr) - ## intron annotation - txdb <- makeTxDbFromGRanges(g) - introns <- unlist(intronsByTranscript(txdb)) + five_prime_utr = five_utr) + ## remove the intronic parts that overlap with exons from other transcripts anno[["intron"]] <- GenomicRanges::setdiff(introns, c(anno[["exon"]], anno[["three_prime_utr"]], @@ -493,7 +521,9 @@ prep_an <- function(gtf, genes){ ## reduce potentially overlapping ranges lapply(anno, reduce) } +``` +```{r} sample_an <- lapply(names(clus_an), function(n) prep_an(gtf, clus_an[[n]]$gene_id %>% unique)) names(sample_an) <- names(clus_an) @@ -754,6 +784,7 @@ for(i in 1:length(xparam)){ + # Peaks split by location We separate the protein coding and spliced lncRNAs from the noncoding RNAs and compare the hexamers enrichment in introns, exons and UTRs. @@ -763,6 +794,7 @@ We focus on the WT sample, because we first want to discover the known GU-repeat ## we get the gene IDs for each of the subsets and prepare gene annotations for each of the sets. ##seperate protein coding and long lncRNAs from the rest g <- gtf[gtf$gene_id %in% clus_an[["WT"]]$gene_id & gtf$type == "gene"] +length(g) pc <- g[g$gene_type == "protein_coding"]$gene_id length(pc) ## separate spliced from unspliced lncRNAs @@ -794,9 +826,6 @@ sample_an_nc <- sample_an_nc[sapply(sample_an_nc, function(x) length(x) > 0)] ``` -```{r} -knitr::knit_exit() -``` ## window 61 @@ -1045,6 +1074,7 @@ svg("gene_type.svg"); p; dev.off() ``` Chi-square goodness of fit test to compare observed to expected (WT) distribution. + ```{r type-chi-square-test} ## 6M vs. WT prop (counts <- table(gene_sets[["6M"]]$gene_type_category)) @@ -1118,10 +1148,18 @@ ggplot(df, aes(x = gene_set, y = gene_length)) + ``` We test for significant differences between the distributiosn with pairwise Mann–Whitney U tests and correct for multiple testing. + ```{r test-sign-gene-length} pairwise.wilcox.test(df$gene_length, df$gene_set, p.adjust.method = "BH") ``` + +```{r} + +knitr::knit_exit() +``` + +Peaks come from clip5_mp_analysis.Rmd ; these run on clipper IDR-harmonized outputs. # Comparison with peaks diff --git a/02_nextflow_clipseq/02_nextflow_clipseq.sh b/02_nextflow_clipseq/02_nextflow_clipseq.sh index 4161826184ba664549f0cd6260e07d3309aad5f9..9f6d717233f860a640c7e1f794081e63a09e48ac 100644 --- a/02_nextflow_clipseq/02_nextflow_clipseq.sh +++ b/02_nextflow_clipseq/02_nextflow_clipseq.sh @@ -54,4 +54,6 @@ nice -n 19 nextflow run --max_cpus 50 \ -profile singularity \ --peakcaller paraclu,piranha \ -resume \ - --outdir out + --outdir out \ + --motif True + diff --git a/03_postprocessing_snv_analysis/01_dreme_motifs.sh b/03_postprocessing_snv_analysis/01_dreme_motifs.sh new file mode 100644 index 0000000000000000000000000000000000000000..2a99776c6de9544fe6ce78decee129b22ffefa14 --- /dev/null +++ b/03_postprocessing_snv_analysis/01_dreme_motifs.sh @@ -0,0 +1,84 @@ +#!/bin/bash +## +## +## + +DATA=/home/imallona/src/polymenidou_manu_clip/01_snv_analysis +BEDTOOLS=/home/imallona/soft/bedtools/bedtools-2.29.2/bin/bedtools +WD=/home/imallona/src/polymenidou_manu_clip/03_postprocessing_snv_analysis +# motif_sample=2000 +fai=hg38.genome +fasta=~/polymenidou_manu_clip/reference/GRCh38.primary_assembly.genome.fa +DREME=~/soft/meme/bin/dreme +STREME=~/soft/meme/bin/streme +DUST=~/soft/meme/bin/dust ## to remove homopolymers + +cd $WD +mkdir -p meme +cd $_ + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \ + "select chrom, size from hg38.chromInfo" > hg38.genome + + +for fn in $(find $DATA -name "deletion_pos_min_3reads*.bed") +do + curr=$(basename $fn .bed) + motif_sample=$(wc -l $fn | cut -f1 -d" ") + + # if [ "$motif_sample" -gt "3000" ]; then + # motif_sample=3000 + # fi + + motif_sample=2000 + + + egrep "^chr[0-9XY]" $fn | \ + $BEDTOOLS slop -s -l 10 -r 10 -i /dev/stdin -g $fai | \ + shuf -n $motif_sample > "$curr"_peaks.bed + + $BEDTOOLS getfasta -s -fi $fasta -bed "$curr"_peaks.bed -fo "$curr"_peaks.fasta + + $DUST "$curr"_peaks.fasta > "$curr"_dusted.fasta + + $DREME -norc -o dreme_"$curr" -p "$curr"_dusted.fasta + + $STREME --p "$curr"_dusted.fasta \ + -o streme_"$curr" \ + --minw 3 \ + --maxw 30 + +done + + +# now for the clusters + + +for fn in $(find $DATA -name "deletion_cluster_dist_10_min_3reads_*.bed") +do + curr=$(basename $fn .bed) + motif_sample=$(wc -l $fn | cut -f1 -d" ") + + # if [ "$motif_sample" -gt "3000" ]; then + # motif_sample=3000 + # fi + + motif_sample=1000 + + + egrep "^chr[0-9XY]" $fn | \ + $BEDTOOLS slop -s -l 10 -r 10 -i /dev/stdin -g $fai | \ + shuf -n $motif_sample > "$curr"_peaks.bed + + $BEDTOOLS getfasta -s -fi $fasta -bed "$curr"_peaks.bed -fo "$curr"_peaks.fasta + + $DUST "$curr"_peaks.fasta > "$curr"_dusted.fasta + + $DREME -norc -o dreme_"$curr" -p "$curr"_dusted.fasta + + $STREME --p "$curr"_dusted.fasta \ + -o streme_"$curr" \ + --minw 3 \ + --maxw 30 + +done diff --git a/README.md b/README.md index f02c503c8372edc205c7c2f4158bfd902b782c10..901dd132e23d4d8f9e2c8f1e18f7234c3ecbf264 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ The snakefile activates and deactivates (?) - `01_snv_analysis`, largely adapted from K. Hembach's crosslinks, postprocesses `00` - `02_nextflow_clipseq`, nfcore clipseq from scratch + +## Important note about the nextflow clipseq + +The deduplication step was removed manually from editing the nf files; crashed with the original files using --deduplicate False. A copy of the nf files is available at `nextflow_edited` + ## Trace Izaskun Mallona