Skip to content
Snippets Groups Projects
Commit e87acb1e authored by Izaskun Mallona's avatar Izaskun Mallona
Browse files

Continue Kathi's analysis; add some MEME scans

parent e79ccd8b
No related branches found
No related tags found
No related merge requests found
......@@ -18,6 +18,7 @@ knitr::opts_chunk$set(autodep = TRUE, cache = FALSE,
### Load packages
```{r, message = FALSE}
## library(here)
library(Rsamtools)
......@@ -32,6 +33,7 @@ library(ggpubr)
library(stringr)
library(RColorBrewer)
library(GenomicAlignments)
library(memes)
```
```{r}
......@@ -61,49 +63,56 @@ genes <- gtf[gtf$type =="gene"]
We generate genomewide pileup tables for each bam file. The pileup is used to identify the location and frequency of mutations (including deletions).
```{r pileup-mut-table}
pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples)
min_allel_depth <- 1 ## at least 1 read with deletion per sample
for(s in samples){
# sbp <- ScanBamParam(which = mp[[s]])
## require at least 3 reads with any mutation per position
## show all mutations with at least 1 supporting read
pp <- PileupParam(max_depth = 30000, include_deletions = TRUE,
include_insertions=FALSE,
min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth)
## compute pileup for both of the replicates
bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]),
yieldSize=5e4,
index = file.path(WD, "BAM_deduplicated_clip5",
paste0(bams[[s]][1], ".bai"))))
pu <- list("rep1" = data.frame(), "rep2" = data.frame())
repeat {
res <- pileup(bf)
if(nrow(res) == 0L)
break
res <- res %>% dplyr::filter(nucleotide == "-")
pu[["rep1"]] <- rbind(pu[["rep1"]], res)
}
close(bf)
bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]),
yieldSize=5e4,
index = file.path(WD, "BAM_deduplicated_clip5",
paste0(bams[[s]][2], ".bai"))))
repeat {
res <- pileup(bf)
if(nrow(res) == 0L)
break
res <- res %>% dplyr::filter(nucleotide == "-")
pu[["rep2"]] <- rbind(pu[["rep2"]], res)
}
close(bf)
lapply(pu, dim)
pu_res[[s]] <- pu
if (!file.exists('pu_res.rds')) {
pu_res <- lapply(samples,function(x) NULL) %>% setNames(samples)
min_allel_depth <- 1 ## at least 1 read with deletion per sample
for(s in samples){
# sbp <- ScanBamParam(which = mp[[s]])
## require at least 3 reads with any mutation per position
## show all mutations with at least 1 supporting read
pp <- PileupParam(max_depth = 30000, include_deletions = TRUE,
include_insertions=FALSE,
min_nucleotide_depth=1, min_minor_allele_depth = min_allel_depth)
## compute pileup for both of the replicates
bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][1]),
yieldSize=5e4,
index = file.path(WD, "BAM_deduplicated_clip5",
paste0(bams[[s]][1], ".bai"))))
pu <- list("rep1" = data.frame(), "rep2" = data.frame())
repeat {
res <- pileup(bf)
if(nrow(res) == 0L)
break
res <- res %>% dplyr::filter(nucleotide == "-")
pu[["rep1"]] <- rbind(pu[["rep1"]], res)
}
close(bf)
bf <- open(BamFile(file.path(WD, "BAM_deduplicated_clip5", bams[[s]][2]),
yieldSize=5e4,
index = file.path(WD, "BAM_deduplicated_clip5",
paste0(bams[[s]][2], ".bai"))))
repeat {
res <- pileup(bf)
if(nrow(res) == 0L)
break
res <- res %>% dplyr::filter(nucleotide == "-")
pu[["rep2"]] <- rbind(pu[["rep2"]], res)
}
close(bf)
lapply(pu, dim)
pu_res[[s]] <- pu
}
saveRDS(pu_res, 'pu_res.rds')
} else {
pu_res <- readRDS('pu_res.rds')
}
```
......@@ -283,7 +292,7 @@ lapply(clus_an, function(x) x$gene_id %>% unique %>% length)
We filter out protein coding genes and spliced lncRNAs where all clusters overlap with small RNAs, because we saw in IGV that all these examples show now evidence of actual binding to the long genes.
@todo update Izaskun, what about the snoRNAs, scaRNAs, snRNAs etc?
```{r identify-wrong-pc-target-genes}
to_filter <- lapply(names(clus_an), function(s) {
......@@ -323,7 +332,7 @@ to_filter <- lapply(names(clus_an), function(s) {
}
return(NULL)
})
unique(unlist(res)) ## all genes that are no true targets
print(unique(unlist(res))) ## all genes that are no true targets
})
names(to_filter) <- names(clus_an)
lengths(to_filter)
......@@ -404,6 +413,9 @@ write.table(gdf, "gene_deletion_cluster_count.txt",
quote = FALSE, sep = "\t", row.names = FALSE)
```
```{r}
table(gdf$gene_type)
```
## Venn diagram
......@@ -467,25 +479,41 @@ Most unique genes have only one cluster and some two clusters.
We center a window on the deletion clusters and compute the oligomer enrichment.
```{r addutrinfotogtf}
txdb1 <- makeTxDbFromGRanges(gtf)
utr3 <- threeUTRsByTranscript(txdb1, use.names=TRUE)
```
```{r prepare-annotation}
# genes <- mp_an[["RBDm"]]$gene_id
prep_an <- function(gtf, genes){
g <- gtf[gtf$gene_id %in% genes]
exon <- g[g$type == "exon"] %>% unique
five_utr <- g[g$type == "five_prime_utr"] %>% unique
three_utr <- g[g$type == "three_prime_utr"] %>% unique
## utr <- g[g$type == "UTR"] %>% unique
anno <- GRangesList(exon = exon_unique, utr = utr)
## intron annotation
txdb <- makeTxDbFromGRanges(g)
introns <- unlist(intronsByTranscript(txdb))
five_utr <- unlist(fiveUTRsByTranscript(txdb))
three_utr <- unlist(threeUTRsByTranscript(txdb))
## We remove all 3' and 5' UTR regions that overlap with any exons
exon_utr <- GenomicRanges::setdiff(exon, three_utr)
exon_unique <- GenomicRanges::setdiff(exon_utr, five_utr) %>% unique
# We remove 3'UTR regions that overlap with 5'UTR regions
three_utr_unique <- GenomicRanges::setdiff(three_utr, five_utr) %>% unique
anno <- GRangesList(exon = exon_unique, three_prime_utr = three_utr_unique,
five_prime_utr = five_utr)
## intron annotation
txdb <- makeTxDbFromGRanges(g)
introns <- unlist(intronsByTranscript(txdb))
five_prime_utr = five_utr)
## remove the intronic parts that overlap with exons from other transcripts
anno[["intron"]] <- GenomicRanges::setdiff(introns, c(anno[["exon"]],
anno[["three_prime_utr"]],
......@@ -493,7 +521,9 @@ prep_an <- function(gtf, genes){
## reduce potentially overlapping ranges
lapply(anno, reduce)
}
```
```{r}
sample_an <- lapply(names(clus_an), function(n)
prep_an(gtf, clus_an[[n]]$gene_id %>% unique))
names(sample_an) <- names(clus_an)
......@@ -754,6 +784,7 @@ for(i in 1:length(xparam)){
# Peaks split by location
We separate the protein coding and spliced lncRNAs from the noncoding RNAs and compare the hexamers enrichment in introns, exons and UTRs.
......@@ -763,6 +794,7 @@ We focus on the WT sample, because we first want to discover the known GU-repeat
## we get the gene IDs for each of the subsets and prepare gene annotations for each of the sets.
##seperate protein coding and long lncRNAs from the rest
g <- gtf[gtf$gene_id %in% clus_an[["WT"]]$gene_id & gtf$type == "gene"]
length(g)
pc <- g[g$gene_type == "protein_coding"]$gene_id
length(pc)
## separate spliced from unspliced lncRNAs
......@@ -794,9 +826,6 @@ sample_an_nc <- sample_an_nc[sapply(sample_an_nc, function(x) length(x) > 0)]
```
```{r}
knitr::knit_exit()
```
## window 61
......@@ -1045,6 +1074,7 @@ svg("gene_type.svg"); p; dev.off()
```
Chi-square goodness of fit test to compare observed to expected (WT) distribution.
```{r type-chi-square-test}
## 6M vs. WT prop
(counts <- table(gene_sets[["6M"]]$gene_type_category))
......@@ -1118,10 +1148,18 @@ ggplot(df, aes(x = gene_set, y = gene_length)) +
```
We test for significant differences between the distributiosn with pairwise Mann–Whitney U tests and correct for multiple testing.
```{r test-sign-gene-length}
pairwise.wilcox.test(df$gene_length, df$gene_set, p.adjust.method = "BH")
```
```{r}
knitr::knit_exit()
```
Peaks come from clip5_mp_analysis.Rmd ; these run on clipper IDR-harmonized outputs.
# Comparison with peaks
......
......@@ -54,4 +54,6 @@ nice -n 19 nextflow run --max_cpus 50 \
-profile singularity \
--peakcaller paraclu,piranha \
-resume \
--outdir out
--outdir out \
--motif True
#!/bin/bash
##
##
##
DATA=/home/imallona/src/polymenidou_manu_clip/01_snv_analysis
BEDTOOLS=/home/imallona/soft/bedtools/bedtools-2.29.2/bin/bedtools
WD=/home/imallona/src/polymenidou_manu_clip/03_postprocessing_snv_analysis
# motif_sample=2000
fai=hg38.genome
fasta=~/polymenidou_manu_clip/reference/GRCh38.primary_assembly.genome.fa
DREME=~/soft/meme/bin/dreme
STREME=~/soft/meme/bin/streme
DUST=~/soft/meme/bin/dust ## to remove homopolymers
cd $WD
mkdir -p meme
cd $_
mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
"select chrom, size from hg38.chromInfo" > hg38.genome
for fn in $(find $DATA -name "deletion_pos_min_3reads*.bed")
do
curr=$(basename $fn .bed)
motif_sample=$(wc -l $fn | cut -f1 -d" ")
# if [ "$motif_sample" -gt "3000" ]; then
# motif_sample=3000
# fi
motif_sample=2000
egrep "^chr[0-9XY]" $fn | \
$BEDTOOLS slop -s -l 10 -r 10 -i /dev/stdin -g $fai | \
shuf -n $motif_sample > "$curr"_peaks.bed
$BEDTOOLS getfasta -s -fi $fasta -bed "$curr"_peaks.bed -fo "$curr"_peaks.fasta
$DUST "$curr"_peaks.fasta > "$curr"_dusted.fasta
$DREME -norc -o dreme_"$curr" -p "$curr"_dusted.fasta
$STREME --p "$curr"_dusted.fasta \
-o streme_"$curr" \
--minw 3 \
--maxw 30
done
# now for the clusters
for fn in $(find $DATA -name "deletion_cluster_dist_10_min_3reads_*.bed")
do
curr=$(basename $fn .bed)
motif_sample=$(wc -l $fn | cut -f1 -d" ")
# if [ "$motif_sample" -gt "3000" ]; then
# motif_sample=3000
# fi
motif_sample=1000
egrep "^chr[0-9XY]" $fn | \
$BEDTOOLS slop -s -l 10 -r 10 -i /dev/stdin -g $fai | \
shuf -n $motif_sample > "$curr"_peaks.bed
$BEDTOOLS getfasta -s -fi $fasta -bed "$curr"_peaks.bed -fo "$curr"_peaks.fasta
$DUST "$curr"_peaks.fasta > "$curr"_dusted.fasta
$DREME -norc -o dreme_"$curr" -p "$curr"_dusted.fasta
$STREME --p "$curr"_dusted.fasta \
-o streme_"$curr" \
--minw 3 \
--maxw 30
done
......@@ -8,6 +8,11 @@ The snakefile activates and deactivates (?)
- `01_snv_analysis`, largely adapted from K. Hembach's crosslinks, postprocesses `00`
- `02_nextflow_clipseq`, nfcore clipseq from scratch
## Important note about the nextflow clipseq
The deduplication step was removed manually from editing the nf files; crashed with the original files using --deduplicate False. A copy of the nf files is available at `nextflow_edited`
## Trace
Izaskun Mallona
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment