#!/bin/bash
# tagmap shell script
# author: David L. Stern
# Janelia Research Campus
# HHMI
# Ashburn, VA
# 9 January 2016
#
# Filters and maps reads, finds overlapping forward and reverse reads, plots
# overlap regions at low and high resolution.
# Operates on a folder full of *fastq.gz files generated by the TagMap
# molecular protocol.
# Currently uses bwa-mem for short-read mapping. This works great for most
# transposons, which duplicate the target site.
#
# Variables:
#
# To detect transposons that do not duplicate the insertion site or genome
# duplications, set parameter "shift" to > 0.
# This will shift reverse reads "shift" bp to the left and then test for
# overlap.
#
# To detect the orientation of the transposable element or duplicated
# sequence, provide the first N bp from the 5' end of the sequence ("seq").
#
# dependencies available to system:
#   prinseq-lite-0.20.4 (http://prinseq.sourceforge.net)
#     Make prinseq-lite executable
#       Navigate to folder and type
#       chmod 755 prinseq-lite.pl
#   bwa 0.7.12-r1039 (http://bio-bwa.sourceforge.net/bwa.shtml)
#   samtools 1.3 (http://samtools.sourceforge.net)
#   gnuplot 5.0 patchlevel 1 (http://www.gnuplot.info)
#
# run as ./tagmap <mapping_genome>
#
# Outputs, per candidate site, written next to each input fastq:
#   <fastq>.<chrom>.<pos>.tview.txt   text alignment view around the site
#   <fastq>.<chrom>.<pos>.gnuplot.ps  depth plot of +/- 1 kb around the site

# Report a failing stage of a pipeline, not only the status of its last stage.
# (set -e is deliberately NOT used: grep finding no reads is a normal outcome.)
set -o pipefail

mapping_genome=${1-}
shift=0
seq=CCCTAGAAAGATAGT

# Print a diagnostic message to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Extract the chrom/position columns of a depth file, byte-sorted for comm.
# Arguments: $1 - tab-separated depth file (chrom, pos, depth)
# Outputs:   "chrom<TAB>pos" lines on stdout
#######################################
sorted_positions() {
  cut -f 1-2 -- "$1" | LC_ALL=C sort
}

#######################################
# Print the first position of every run of consecutive positions.
# The input is re-sorted numerically by position within each chromosome,
# because the comm-compatible ordering is lexical (100 < 101 < 99) and
# would otherwise split every run at power-of-ten boundaries.
# Arguments: $1 - file of "chrom<TAB>pos" lines, in any order
# Outputs:   "chrom<TAB>pos" lines on stdout, one per run
#######################################
overlap_run_starts() {
  LC_ALL=C sort -t $'\t' -k1,1 -k2,2n -- "$1" \
    | awk -F '\t' '
        # A run starts when the chromosome changes or the position is not
        # the immediate successor of the previous one.
        $1 != chrom || $2 != prev + 1 { print $1 "\t" $2 }
        { chrom = $1; prev = $2 }'
}

#######################################
# Keep the depth records whose chrom AND position appear in a site list.
# Arguments: $1 - site file ("chrom<TAB>pos")
#            $2 - depth file ("chrom<TAB>pos<TAB>depth")
# Outputs:   matching depth records on stdout
#######################################
depth_at_sites() {
  awk -F '\t' '
    FNR == NR { want[$1 FS $2]; next }
    (($1 FS $2) in want)' "$1" "$2"
}

#######################################
# Combine forward and reverse depths for sites present in both files.
# Matching is done on chrom AND position and does not need sorted input.
# Arguments: $1 - forward depth records ("chrom<TAB>pos<TAB>depth")
#            $2 - reverse depth records (same layout)
# Outputs:   "pos chrom fwd_depth rev_depth" (space separated) on stdout
#######################################
pair_strand_depths() {
  awk -F '\t' '
    FNR == NR { rev[$1 FS $2] = $3; next }
    (($1 FS $2) in rev) { print $2, $1, $3, rev[$1 FS $2] }' "$2" "$1"
}

#######################################
# Print position and depth for exactly one chromosome of a depth file.
# Arguments: $1 - chromosome name (exact match, so chr1 does not select chr10)
#            $2 - depth file ("chrom<TAB>pos<TAB>depth")
# Outputs:   "pos<TAB>depth" lines on stdout
#######################################
chrom_depth() {
  # Concatenating with "" forces a string comparison even when chromosome
  # names look numeric.
  awk -F '\t' -v chrom="$1" '("" $1) == ("" chrom) { print $2 "\t" $3 }' "$2"
}

#######################################
# Write a tview text file and a gnuplot postscript file per candidate site.
# Globals:   mapping_genome, shift (read)
# Arguments: $1 - fastq file name whose intermediate files already exist
#######################################
plot_candidate_sites() {
  local filename=$1
  local good="${filename}.good.fastq"
  local bp chrom start stop plot_cmd

  # The site list is read on fd 3 so that no command inside the loop can
  # consume it through stdin.
  while read -r -u 3 bp chrom _; do
    # grab reads adjoining insertion for specific chrom
    chrom_depth "$chrom" "${good}.adjoin.fwd.depth" \
      > "${filename}.plot_adjoined_forward"
    chrom_depth "$chrom" "${good}.adjoin.rev.depth" \
      > "${filename}.plot_adjoined_reverse"

    # print figs only if some reads overlap transposable element seq
    [[ -s "${filename}.plot_adjoined_forward" \
      || -s "${filename}.plot_adjoined_reverse" ]] || continue

    # grab position and depth from selected chromosome
    chrom_depth "$chrom" "${good}.sorted.fwd.depth" > "${filename}.plot_forward"
    # add back shifted bp to positions for plotting
    chrom_depth "$chrom" "${good}.sorted.rev.depth" \
      | awk -v shift="$shift" '{ print ($1 + shift) "\t" $2 }' \
      > "${filename}.plot_reverse"

    # text view starting 30 bp upstream of the site (never before position 1)
    start=$(( bp - 30 ))
    (( start < 1 )) && start=1
    samtools tview -d T -p "${chrom}:${start}" \
      "${good}.sorted.bam" "$mapping_genome" \
      > "${filename}.${chrom}.${bp}.tview.txt"

    # plot 1 kb up- and downstream of the site
    start=$(( bp - 1000 ))
    stop=$(( bp + 1001 ))
    plot_cmd="set terminal postscript color ; set xrange [$start:$stop]; unset key; plot"
    plot_cmd+=" '${filename}.plot_forward' with dots lw 5 linecolor rgb 'red',"
    plot_cmd+=" '${filename}.plot_reverse' with dots lw 5 linecolor rgb 'blue',"
    plot_cmd+=" '${filename}.plot_adjoined_forward' with points pointtype 1 linecolor rgb '#F08080',"
    plot_cmd+=" '${filename}.plot_adjoined_reverse' with points pointtype 1 linecolor rgb '#00CED1'"
    gnuplot -p -e "$plot_cmd" > "${filename}.${chrom}.${bp}.gnuplot.ps"
  done 3< "${filename}.candidate_sites"
}

#######################################
# Run the complete TagMap analysis for one fastq file.
# Globals:   mapping_genome, shift, seq (read)
# Arguments: $1 - fastq file name
# Returns:   0 on success, 1 if de-duplication or mapping failed (the
#            intermediate files are then left in place for inspection)
#######################################
process_fastq() {
  local filename=$1
  local good="${filename}.good.fastq"
  local sorted="${good}.sorted"
  local adjoin="${good}.adjoin"

  # remove PCR duplicates
  # prinseq automatically appends a fastq suffix to output
  if ! prinseq-lite.pl -derep 12 -fastq "$filename" \
      -out_good "${filename}.good" -out_bad null; then
    warn "tagmap: prinseq-lite.pl failed for ${filename}; skipping"
    return 1
  fi

  # map reads to genome, convert to bam, and sort
  if ! bwa mem "$mapping_genome" "$good" \
      | samtools view -b /dev/stdin \
      | samtools sort /dev/stdin > "${sorted}.bam"; then
    warn "tagmap: mapping failed for ${filename}; skipping"
    return 1
  fi
  # make bai file
  if ! samtools index "${sorted}.bam"; then
    warn "tagmap: cannot index ${sorted}.bam; skipping"
    return 1
  fi

  # Produce two files, with depth for reads in opposite orientation.
  # Reverse-strand positions are moved "shift" bp to the left.
  samtools view -b -F 0x10 "${sorted}.bam" \
    | samtools depth /dev/stdin > "${sorted}.fwd.depth"
  samtools view -b -f 0x10 "${sorted}.bam" \
    | samtools depth /dev/stdin \
    | awk -v shift="$shift" '{ print $1 "\t" ($2 - shift) "\t" $3 }' \
    > "${sorted}.rev.depth"

  # grab just positions
  sorted_positions "${sorted}.fwd.depth" > "${sorted}.fwd.pos"
  sorted_positions "${sorted}.rev.depth" > "${sorted}.rev.pos"

  # find sites that are covered by both forward and reverse reads
  LC_ALL=C comm -12 "${sorted}.rev.pos" "${sorted}.fwd.pos" \
    > "${filename}.overlap"

  # find starts of overlap regions
  overlap_run_starts "${filename}.overlap" > "${filename}.overlap_starts"

  # look up the read depth on each strand at the overlap starts
  depth_at_sites "${filename}.overlap_starts" "${sorted}.fwd.depth" \
    > "${filename}.overlap_for_depth"
  depth_at_sites "${filename}.overlap_starts" "${sorted}.rev.depth" \
    > "${filename}.overlap_rev_depth"

  # append both depths to single file: pos chrom fwd_depth rev_depth
  pair_strand_depths "${filename}.overlap_for_depth" \
    "${filename}.overlap_rev_depth" > "${filename}.overlap_depth"

  # filter to sites with multiple reads on both strands
  awk '$3 > 1 && $4 > 1' "${filename}.overlap_depth" \
    > "${filename}.candidate_sites"

  # To get orientation of inserts, select reads from the good.fastq file that
  # contain the user supplied 5' seq - that is, they adjoin the insert - and
  # map only those, dropping unmapped reads (-F 4).
  # The status of this pipeline is not checked: grep exits 1 when no read
  # contains the sequence, which is a valid result.
  grep -B 1 -A 2 -- "$seq" "$good" \
    | sed '/^--$/d' \
    | bwa mem "$mapping_genome" - \
    | samtools view -b -F 4 /dev/stdin \
    | samtools sort /dev/stdin > "${adjoin}.bam"
  samtools index "${adjoin}.bam"

  # Produce two files, with depth for reads in opposite orientation
  samtools view -b -F 0x10 "${adjoin}.bam" \
    | samtools depth /dev/stdin > "${adjoin}.fwd.depth"
  samtools view -b -f 0x10 "${adjoin}.bam" \
    | samtools depth /dev/stdin > "${adjoin}.rev.depth"

  plot_candidate_sites "$filename"

  # comment out following line to troubleshoot the script
  rm -f -- "${filename}".good* "${filename}".overlap* \
    "${filename}.candidate_sites" "${filename}".plot*
}

#######################################
# Entry point: build the bwa index if needed, unpack the reads and process
# every *.fastq file in the current directory.
# Globals:   mapping_genome (read)
# Returns:   0 on success, 2 on usage error, 1 if anything else failed
#######################################
main() {
  local tool filename status=0
  local -a archives fastqs

  if [[ -z "$mapping_genome" ]]; then
    warn "Usage: ${0##*/} <mapping_genome>"
    return 2
  fi

  for tool in prinseq-lite.pl bwa samtools gnuplot; do
    if ! command -v "$tool" > /dev/null; then
      warn "tagmap: required program not found in PATH: ${tool}"
      return 1
    fi
  done

  # prepare bwa index of relevant genome
  if [[ -e "${mapping_genome}.sa" ]]; then
    echo "bwa index already built."
  else
    echo "building bwa index."
    if ! bwa index "$mapping_genome"; then
      warn "tagmap: bwa index failed for ${mapping_genome}"
      return 1
    fi
  fi

  # Unmatched patterns expand to nothing instead of to themselves, so a
  # re-run without *.gz files (or a folder without reads) is handled cleanly.
  shopt -s nullglob

  # unzip all fastq.gz files
  archives=( *fastq.gz )
  if (( ${#archives[@]} > 0 )); then
    gunzip -- "${archives[@]}" \
      || warn "tagmap: gunzip reported errors; continuing with available files"
  fi

  fastqs=( *.fastq )
  if (( ${#fastqs[@]} == 0 )); then
    warn "tagmap: no *.fastq files found in ${PWD}"
    return 1
  fi

  for filename in "${fastqs[@]}"; do
    process_fastq "$filename" || status=1
  done
  return "$status"
}

main "$@"