#!/bin/bash
##
## Reprocesses M Berlanga's CLIP data with nf-core/clipseq, using only one read (R2)
##
## Tue Apr  5 08:28:56 CEST 2022

# Root of the nf-core/clipseq reprocessing workspace.
WD=/home/imallona/polymenidou_manu_clip/nextflow

# Stage the inputs: symlink every R2 FASTQ found under the source tree into
# $WD/data. Abort if that directory cannot be created or entered, so the
# links are never created in whatever directory the script was started from.
mkdir -p "$WD/data" || exit 1
cd "$WD/data" || exit 1

# NUL-delimited find output keeps each path intact (no word splitting or
# glob expansion, unlike iterating over $(find ...)).
while IFS= read -r -d '' fn
do
    # Leave links from a previous run untouched instead of letting ln fail
    # with "File exists" on every re-run.
    if [[ -e "${fn##*/}" || -L "${fn##*/}" ]]
    then
        continue
    fi
    ln -s -- "$fn" .
done < <(find /home/kathi/Manu_TDP_CLIP/FASTQ/ -name "*R2*fastq.gz" -print0)


# Create the run directory and switch to it. The path is spelled out instead
# of relying on $_ (last argument of the previous command), and the script
# aborts if the directory is unavailable so design.conf is never written
# somewhere else.
mkdir -p "$WD/run" || exit 1
cd "$WD/run" || exit 1

# Sample sheet: one row per sample with the path of its staged R2 FASTQ.
# Paths are derived from $WD so they stay in sync with the staging directory
# ($WD/data); the here-doc delimiter is unquoted on purpose so $WD expands.
cat << EOF > design.conf
sample,fastq
20200123.A-6M2_R2,${WD}/data/20200123.A-6M2_R2.fastq.gz
20200123.A-6M_R2,${WD}/data/20200123.A-6M_R2.fastq.gz
20200123.A-RBDm2_R2,${WD}/data/20200123.A-RBDm2_R2.fastq.gz
20200123.A-RBDm_R2,${WD}/data/20200123.A-RBDm_R2.fastq.gz
20200123.A-WT2_R2,${WD}/data/20200123.A-WT2_R2.fastq.gz
20200123.A-WT_R2,${WD}/data/20200123.A-WT_R2.fastq.gz
EOF


## Beware: the pipeline tries to deduplicate something, and switching that off
## with `--deduplicate False` raises this error:
## Channel `ch_aligned` has been used twice as an input by process `get_crosslinks` and process `rseqc`

## RNA premapping to smRNAs included
# As per their documentation, https://nf-co.re/clipseq/1.0.0/usage
# The pipeline comes equipped with some 'smallRNA' FASTA references for premapping.
# This includes rRNA and tRNA sequences, the sources of which can be viewed here.
# The purpose of this premapping is to capture abundant ncRNA that are present in
# multiple similar copies in the genome, making them hard to assign reads to.
# tRNA can occur within genes and without proper handling can result in misassignment
# of reads to mRNA in certain situations.

# https://rnajournal.cshlp.org/content/early/2018/08/21/rna.067348.118

## The bundled smallRNA reference is indeed mostly tRNAs, with few other small RNAs
# zcat /home/imallona/.nextflow/assets/nf-core.old/clipseq/assets/small_rna/Homo_sapiens/Homo_sapiens.smallRNA.fa.gz | grep ">" 

# Arguments for the nf-core/clipseq launch, kept in an array so each option
# sits on its own line without backslash continuations. Order matches the
# command line that is passed to nextflow.
clipseq_args=(
    run
    --max_cpus 50
    nf-core/clipseq
    --genome GRCm38
    --smrna_org mouse
    --input design.conf
    -profile singularity
    --peakcaller paraclu,piranha
    -resume
    --outdir out
    --motif True
)

# Launch at the lowest scheduling priority (nice 19).
nice -n 19 nextflow "${clipseq_args[@]}"