NB: this is a convenience file, and will probably lag behind the main version. It’ll be updated periodically (note date above…)
# raw code
## s e t u p =======================================
# databases for different tools
DB=/data/databases # HPC only
HGR38_BT=/data/databases/hostremoval/Homo_sapiens/Bowtie2 # HPC only
K2REFDIR=/data/databases/kraken2 # HPC only
BT_DB=/data/Food/analysis/R0936_redcheese/ref
# our overall data structure
RAW=/data/Food/primary/R0936_redcheese
WRK=/data/Food/analysis/R0936_redcheese
# scripts, and slurms for queueing up jobs
MAT=$WRK/Materials
SLURM=$WRK/slurms
# outputs etc
QC=$WRK/1__qc
FQC=$QC/fastqc
FILT=$WRK/2__filt
KDOUT=$WRK/3__knead
KROUT=$WRK/4__krak2
#!# set a sample-name to test things on
TEST=Q4T8
mkdir $RAW $WRK $FILT $QC $FQC $KDOUT $KROUT
# make script dirs etc.
mkdir $MAT $SLURM
# make dirs for qc outputs
mkdir $FQC/fhi_redch_raw $FQC/fhi_redch_trimm $FQC/fhi_redch_raw_multi $FQC/fhi_redch_filt_multi
# load programs, including specific versions of bowtie2 and kracken
module load parallel fastqc multiqc trimmomatic bowtie2 samtools/1.10 kraken2/2.1.1 braken
## d a t a & Q C =======================================
# then run a fastqc for F and R files, output in the dirs we made above
fastqc -t 4 $RAW/${TEST}* -o $FQC/fhi_redch_raw
# make a report that includes both F and R reads for this sample
multiqc $FQC/fhi_redch_raw -o $FQC/fhi_redch_raw_multi
# write a slurm script first
echo '#!/bin/bash
SBATCH --job-name=knead_fastq
SBATCH --output=knead_fastq.txt
SBATCH --ntasks=15
SBATCH --time=15:00
IN=$1
OUT=$2
# time just tells us how long this takes, so we know for next time
# -t is the number of threads (tasks) to use
# curly brackets {} allow us to match either gz or bz2 file extensions
time fastqc -t 15 $IN/*fastq.{bz2,gz} -o $OUT
' > $SLURM/slurm_fqc.sh
# trust slurm
sbatch $SLURM/slurm_fqc.sh $RAW $FQC/fhi_redch_raw
# combine outputs
time multiqc $FQC/fhi_redch_raw -o $FQC/fhi_redch_raw_multi
# then copy multiqc output to local computer (try FileZilla), and open in your browser!
## 2. t r i m m =======================================
cho '>adapter_seq
CTGTCTCTTATACACATCT
>adapter_mate_seq
AGATGTGTATAAGAGACAG
>Transposase_Adap__for_tagmentation_1
TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
>Transposase_Adap__for_tagmentation_2
GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
>PCR_primer_index_1
CAAGCAGAAGACGGCATACGAGATNNNNNNNGTCTCGTGGGCTCGG
>PCR_primer_index_2
AATGATACGGCGACCACCGAGATCTACACNNNNNTCGTCGGCAGCGTC
>PCR_primer_index_2_rc
GACGCTGCCGACGANNNNNGTGTAGATCTCGGTGGTCGCCGTATCATT
>PCR_primer_index_1_rc
CCGAGCCCACGAGACNNNNNNNATCTCGTATGCCGTCTTCTGCTTG
>Transposase_Adap__for_tagmentation_2_rc
CTGTCTCTTATACACATCTCCGAGCCCACGAGAC
>Transposase_Adap__for_tagmentation_1_rc
CTGTCTCTTATACACATCTGACGCTGCCGACGA
>adapter_mate_seq_rc
CTGTCTCTTATACACATCT
>adapter_seq_rc
AGATGTGTATAAGAGACAG' > $MAT/fqc_trimmo_ill_ref.fa
trimmomatic PE \
$RAW/${TEST}_R1_001.fastq.gz $RAW/${TEST}_R2_001.fastq.gz \
$FILT/${TEST}_R1_trimm.fastq.gz $FILT/${TEST}_R1_trimm_unpaired.fastq.gz \
$FILT/${TEST}_R2_trimm.fastq.gz $FILT/${TEST}_R2_trimm_unpaired.fastq.gz \
HEADCROP:20 \
CROP:130 \
ILLUMINACLIP:$MAT/fqc_trimmo_ill_ref.fa:2:30:10:5 \
SLIDINGWINDOW:6:15 MINLEN:80 \
-threads 6 > $FILT/trimmo_${TEST}.out
# combine all those different parts!
ls $RAW/*fastq.gz | sed -e 's/.*\/\(.*\)_R._001.*/\1/g' | sort | uniq > $MAT/samples
# First, we make a slurm script for running Trimmomatic:
echo '#!/bin/bash
#SBATCH --job-name=trimmoRaw
#SBATCH --output=trimmoRaw.txt
#SBATCH --ntasks=6
#SBATCH --time=18:00
# these take the terms given after the scriptname, i.e. "... $i $RAW $FILT $MAT"
SAMPLE=$1
IN=$2
OUT=$3
REFDIR=$4
# trimmomatic - use backslash to separate rows
trimmomatic PE \
$IN/${SAMPLE}_R1.fastq.gz \
$IN/${SAMPLE}_R2.fastq.gz \
$OUT/${SAMPLE}_R1_trimm.fastq.gz \
$OUT/${SAMPLE}_R1_trimm_unpaired.fastq.gz \
$OUT/${SAMPLE}_R2_trimm.fastq.gz \
$OUT/${SAMPLE}_R2_trimm_unpaired.fastq.gz \
HEADCROP:25 \
CROP:125 \
ILLUMINACLIP:$REFDIR/fqc_trimmo_ill_ref.fa:2:30:10:5 \
SLIDINGWINDOW:6:15 \
MINLEN:80 \
-threads 6 > $OUT/trimmo_${SAMPLE}.out 2>&1' > $MAT/slurm_trimmo.sh
# this command lists all the sample names
cat $MAT/samples
# use in for loop
for i in $(cat $MAT/samples);
do sbatch $MAT/slurm_trimmo.sh $i $RAW $FILT $MAT;
done
fastqc -t 4 $FILT/*fastq.gz -o $FQC/fhi_redch_filt
# move unpaired reports away - avoid clogging up multiqc vis
mkdir $FQC/fhi_redch_filt_unpaired
mv $FQC/fhi_redch_filt/*unpaired* $FQC/fhi_redch_filt_unpaired
# look at just the trimmed, paired data (no unpaired reads)
multiqc $FQC/fhi_redch_filt/ -o $FQC/fhi_redch_filt_multi
## 3. d e c o n t a m =======================================
# download to our raw data folder!
echo '#!/bin/bash
cd $1
# download your own genome (more or less) - 1.2 GB, 12min :: from the NCBI browser https://www.ncbi.nlm.nih.gov/data-hub/genome/GCF_000001405.40/
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/GCF_000001405.40/download?filename=GCF_000001405.40.zip" -H "Accept: application/zip"
# decoy genome - 9 MB
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5ss.fa.gz
# cow genome - 825 MB, 5min
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/GCA_021234555.1/download?filename=GCA_021234555.1.zip" -H "Accept: application/zip"
# sheep genome - 890 MB, 4min
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/GCF_016772045.1/download?filename=GCF_016772045.1.zip" -H "Accept: application/zip"
# goat genome - 930 MB, 3min
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/GCF_001704415.2/download?filename=GCF_001704415.2.zip" -H "Accept: application/zip"
# chicken genome - 440 MB, 1min
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/GCF_016699485.2/download?filename=GCF_016699485.2.zip" -H "Accept: application/zip"' > $MAT/trimmo_genomes_dl.sh
# do check that curl is installed!
# if not, try activating it: "module load curl"
curl -h
# give "run" permission for script to be run, for the user only
chmod u+x $MAT/trimmo_genomes_dl.sh
# make a dir for genomes in our RAW folder
mkdir $RAW/ref
# run the script while pointing at those files
$MAT/trimmo_genomes_dl.sh $RAW/ref
##
## N O T E :: you will need to extract the genome files (.....fna) all by yourself!
##
# compress downloads
parallel gzip {} ::: $RAW/ref/*fna
BT_threads=10
# testing database build
time bowtie2-build --t $BT_threads $RAW/ref/hs37d5ss.fa.gz BT_DB/decoy_hs37d5ss.bt2 > $BT_DB/decoy_hs37d5ss.bt2.buildlog
## individual builds:
# decoy.9MB.fna.gz; 19 seconds for decoy, 9 threads, 15GB RAM
# chicken.340MB.fna.gz; 10min, 9 threads, 15GB RAM
# cow.853MB.fna.gz; 28m, 9 threads, 15GB RAM
# build ALL genomes into BT2 database via slurm:
echo '#!/bin/bash
#SBATCH --job-name=bt2DB
#SBATCH --output=bt2DB.txt
#SBATCH --ntasks=10
#SBATCH --time=179:00
INOUT=$1
# feel free to add / remove genomes that are relevant to you
# do check the names before you run!
time bowtie2-build --large-index --threads 10 \
$INOUT/GCA_021234555.1_ARS-LIC_NZ_Jersey_genomic.fna.gz,\
$INOUT/GCF_000001405.40_GRCh38.p14_genomic.fna.gz,\
$INOUT/GCF_001704415.2_ARS1.2_genomic.fna.gz,\
$INOUT/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_genomic.fna.gz,\
$INOUT/GCF_016772045.1_ARS-UI_Ramb_v2.0_genomic.fna.gz,\
$INOUT/hs37d5ss.fa.gz \
$INOUT/mult_ref #> $INOUT/mult_ref.buildlog # 2>&1
' > $MAT/slurm_bt2db.sh
# and then run
$MAT/slurm_bt2db.sh $BT_DB
## decontaminate one sample:
# 1 align $TEST sequences to ref using bt2.
bowtie2 -p $BT_threads -x $BT_DB/mult_ref -1 $FILT/${TEST}_R1_trimm.fastq.gz -2 $FILT/${TEST}_R2_trimm.fastq.gz -S $KDOUT/${TEST}_bt2_refmapped.sam
# 2 - view with samtools, and convert to bam (binary)
samtools view -bS $KDOUT/${TEST}_bt2_refmapped.sam > $KDOUT/${TEST}_bt2_refmapped.bam
# 3 - filter the contaminant sequences out: F 256 : exclude those mapping to ref; f 12 : F and R *both* unmapped
samtools view -b -f 12 -F 256 $KDOUT/${TEST}_bt2_refmapped.bam > $KDOUT/${TEST}_bt2_decontamd.bam
# 4 - ensure reads are in a sensible order
samtools sort -n -m 5G -@ 2 $KDOUT/${TEST}_bt2_decontamd.bam -o $KDOUT/${TEST}_bt2_decontamd_sort.bam > ${TEST}_bam_sort.out
# 5 - stream outputs of non-interest to /dev/null
samtools fastq -@ 8 $KDOUT/${TEST}_bt2_decontamd_sort.bam -1 $KDOUT/${TEST}_bt2decon_R1.fastq.gz -2 $KDOUT/${TEST}_bt2decon_R2.fastq.gz -0 /dev/null -s /dev/null -n
## look at the fastq!
ls -lsh $KDOUT/${TEST}*
# removing intermediates: SAM and BAM files are huge! delete --WHEN FINISHED--
rm $KDOUT/${TEST}*[sam,bam]
## decontaminate multiple samples
echo '#!/bin/bash
#SBATCH --job-name=deco_bt2.%j
#SBATCH --output=deco_bt2.%j.txt
SAMPLE=$1
IN=$2
OUT=$3
REF=$4
THREADS=$5
## stamp for the output
DAT_TIM=$(date +"%T %F")
echo "--------------------------------------------"
echo $DAT_TIM
echo sampl: $SAMPLE
echo input: $IN
echo outpt: $OUT
echo "--------------------------------------------"
echo "-- align -------------------------------------"
# -x $REF/mult_ref \
time bowtie2 -p $THREADS \
-x $REF/deco.bt2 \
-1 $IN/${SAMPLE}_R1_trimm.fastq.gz \
-2 $IN/${SAMPLE}_R2_trimm.fastq.gz \
-S $OUT/${SAMPLE}_bt2_refmapped.sam
if [ $?=0 && -s $OUT/${SAMPLE}_bt2_refmapped.sam] then
echo "-- change format -------------------------------------"
time samtools view -bS $OUT/${SAMPLE}_bt2_refmapped.sam > $OUT/${SAMPLE}_bt2_refmapped.bam
echo "-- remove matches -------------------------------------"
time samtools view -b -f 12 -F 256 $OUT/${SAMPLE}_bt2_refmapped.bam > $OUT/${SAMPLE}_bt2_decontamd.bam
echo "-- re-organise reads -------------------------------------"
time samtools sort -n -m 5G -@ $THREADS $OUT/${SAMPLE}_bt2_decontamd.bam -o $OUT/${SAMPLE}_bt2_decontamd_sort.bam
echo "-- to fastq, dump other -------------------------------------"
time samtools fastq -@ $THREADS $OUT/${SAMPLE}_bt2_decontamd_sort.bam -1 $OUT/${SAMPLE}_bt2decon_R1.fastq.gz -2 $OUT/${SAMPLE}_bt2decon_R2.fastq.gz -0 /dev/null -s /dev/null -n
## shrink / delete
#if [ -s $OUT/${SAMPLE}_bt2decon_R1.fastq.gz ] && [ -s $OUT/${SAMPLE}_bt2decon_R2.fastq.gz ] ;
# #echo "-- deleting *AM --------------------" ;
# #then rm $OUT/${SAMPLE}_*.[bam,sam] & ;
# echo "-- shrinking *AM --------------------" ;
# then gzip $OUT/${SAMPLE}_*.[bam,sam] & ;
#fi
' > $MAT/slurm_deco_full.sh # SAMPLE IN OUT REF THREADS
# send just one sample to the script with "head -1"
for i in $( cat $INES/Materials/samples | head -1 );
do sbatch $MAT/slurm_deco_full.sh $i $FILT $KDOUT $BT_DB 10 ;
done
## 4.a k r a k e n 2 =============================================
## test one sample ----------------------------
# set how many jobs we plan on using
KR_threads=5
# test
time kraken2 --db $K2REFDIR \
$KDOUT/${TEST}_bt2decon_R1.fastq.gz \
$KDOUT/${TEST}_bt2decon_R2.fastq.gz \
--paired \
--confidence 0.5 \
--minimum-hit-groups 5 \
--minimum-base-quality 20 \
--gzip-compressed \
--threads $KR_threads \
--report-zero-counts \
--report $KROUT/${TEST}_test_kraken_report \
--unclassified-out $KROUT/${TEST}_test_kraken_unclass# \
--output $KROUT/${TEST}_test_kraken_output
# observe:
less -S $KROUT/${TEST}_kraken_report
## fire all samples ----------------------------
echo '#!/bin/bash
#SBATCH --job-name=krak2_filt
#SBATCH --output=krak2_filt.%j.txt
#SBATCH --time=15:00
SAMPLE=$1
IN=$2
OUT=$3
DB=$4
THREADS=$5
## we do NOT want to use this! Bracken wont read it
#--use-mpa-style \
time kraken2 --db $DB \
$IN/${SAMPLE}_bt2decon_R1.fastq.gz \
$IN/${SAMPLE}_bt2decon_R2.fastq.gz \
--paired \
--confidence 0.15 \
--report-zero-counts \
--threads $THREADS \
--gzip-compressed \
--report $OUT/${SAMPLE}_kraken_report \
--unclassified-out /dev/null/${SAMPLE}_kraken_unclass# \
--output /dev/null/${SAMPLE}_kraken_output
' > $MAT/slurm_krak2.sh # SAMPLE IN OUT DB THREADS
for i in $( cat $INES/Materials/samples );
do sbatch $MAT/slurm_krak2.sh $i $KDOUT $KROUT $K2REFDIR $KR_threads ;
done
## 4.b b r a c k e n ================================
## build data base first ----------------------------
BR_kmer=35 # this is the default kmer length of the Kraken2 DB on the HPC
BR_leng=125 # length post-trimm
# need to make a bit of a hack here
mkdir ~/bin ; cd ~/bin &&
wget https://github.com/jenniferlu717/Bracken/archive/refs/heads/master.zip ; unzip master.zip && rm master.zip
cd ~/bin/Bracken-master
chmod +x install_bracken.sh ; ./install_bracken.sh
# next, *symlink* to the K2 DB as not sure can write out to main ref dir / $INES
module load kraken2/2.1.1
module unload braken
ln -s $K2REFDIR/opts.k2d $WRK/ref/opts.k2d
ln -s $K2REFDIR/hash.k2d $WRK/ref/hash.k2d
ln -s $K2REFDIR/taxo.k2d $WRK/ref/taxo.k2d
ln -s $K2REFDIR/seqid2taxid.map $WRK/ref/seqid2taxid.map
ln -s $K2REFDIR/library $WRK/ref/library
ln -s $K2REFDIR/taxonomy $WRK/ref/taxonomy
cd ~
echo '#!/bin/bash
#SBATCH --job-name=brack_db
#SBATCH --njobs=35
#SBATCH --output=brack_db.%j.txt
#SBATCH --time=240:00
BRDB_DIR=$1
BRDB_KMER=$2
BRDB_LENG=$3
BRDB_THREADS=$4
## not tested!
time ./bracken-build -d $BRDB_DIR -k $BRDB_KMER -l $BRDB_LENG -t $BRDB_THREADS
' > $MAT/slurm_brak_db.sh # dir kmer length threads
sbatch slurm_brak_db.sh $MSDAT/ref/kraken2 $BR_kmer $BR_leng 20
## run bracken ----------------------------
# crucial parameters! fee free to adapt s and l. r probably more use fi read length differs from DB being used
BR_r=125
BR_l=S
BR_t=10 # counts! not threads
# run bracken, one sample
bracken -d $WRK/ref/ -i $KROUT/${TEST}_kraken_report -o $KROUT/${TEST}.bracken -r $BR_r -l $BR_l -t $BR_t
# note the header for the different columns:
head -1 $KROUT/${TEST}.bracken
less -S $KROUT/${TEST}.bracken+
# run on all samples
for i in $(cat $MAT/samples);
do bracken -d $K2REFDIR/ -i $KROUT/${i}_kraken_report -o $KROUT/${i}.bracken -r $BR_r -l $BR_l -t $BR_t ;
done
### combine samples ------------------------------------------
## --- code not yet sane ---------------------------------
## -----------------------------------------------------------
### export to R-----------------------------------------------
## --- code not yet sane ---------------------------------
## -----------------------------------------------------------