### Mapping to reference genome hg38 or dm6 using Bowtie2 ###

# Paired-end alignment of both replicates; the SAM output of bowtie2 is 
# converted to BAM on the fly.
bowtie2 -x $INDEX_PATH/genome -p 24 -1 $SEQ_PATH/*Lib20_1*.gz -2 $SEQ_PATH/*Lib20_2*.gz | samtools view -bS > K562_K27Me3_Abcam_REP1.bam
bowtie2 -x $INDEX_PATH/genome -p 24 -1 $SEQ_PATH/*Lib21_1*.gz -2 $SEQ_PATH/*Lib21_2*.gz | samtools view -bS > K562_K27Me3_Abcam_REP2.bam


### Sorting mapped reads with respect to position ###

# the output folder has to exist before samtools can write into it
mkdir -p $SEQ_PATH/sorted

samtools sort K562_K27Me3_Abcam_REP1.bam > $SEQ_PATH/sorted/K562_K27Me3_Abcam_REP1_sorted.bam
samtools sort K562_K27Me3_Abcam_REP2.bam > $SEQ_PATH/sorted/K562_K27Me3_Abcam_REP2_sorted.bam


### Indexing sorted reads ###

samtools index $SEQ_PATH/sorted/K562_K27Me3_Abcam_REP1_sorted.bam
samtools index $SEQ_PATH/sorted/K562_K27Me3_Abcam_REP2_sorted.bam


### Generating BigWig files using DeepTools (v 3.1.3)  ###

# the output folder has to exist before bamCoverage can write into it
mkdir -p $SEQ_PATH/bw-normalized

# the sorted and indexed BAM files are located in $SEQ_PATH/sorted
for i in $SEQ_PATH/sorted/*sorted.bam;

do

# The output name has to be derived inside the loop, once per file:
# strip the directory and the ".bam" extension of the current file.
BASE_NAME=$(basename "$i" .bam)

/home/pmore/Piyush_Project/Softwares/Deeptools/bin/bamCoverage -b "$i" --normalizeUsing RPKM -bs 50 --smoothLength 175 -o $SEQ_PATH/bw-normalized/${BASE_NAME}.bw --numberOfProcessors 16

done

### Peak calling for narrow marks ( version 2.1.1.20160309) ###

# Placeholders used in all MACS2 calls below:
#   $TREATMENT  BAM file of the ChIP sample
#   $CONTROL    BAM file of the matching control (input) sample
#   $OUT_DIR    folder the results are written to
#   $NAME       prefix of the output files
# "-g dm" is used for Drosophila (dm6), "-g hs" for human (hg38).

macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g dm --outdir $OUT_DIR --name $NAME
macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g hs --outdir $OUT_DIR --name $NAME

### Peak calling for broad marks ( version 2.1.1.20160309) ###

macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g dm --broad --outdir $OUT_DIR --name $NAME
macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g hs --broad --outdir $OUT_DIR --name $NAME

### Peak calling without FDR restriction ( version  2.1.1.20160309, used for ROC and PRC curves) ###

macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g dm -q 0.99 --outdir $OUT_DIR --name $NAME
macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g hs -q 0.99 --outdir $OUT_DIR --name $NAME

macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g dm --broad --broad-cutoff 0.99 --outdir $OUT_DIR --name $NAME
macs2 callpeak -t $TREATMENT -c $CONTROL -f BAMPE -g hs --broad --broad-cutoff 0.99 --outdir $OUT_DIR --name $NAME

#### Pearsons correlation plots (Deeptools v 3.1.3) ####

# Placeholders: $define = labels of the bigWig files, $OUT_DIR = output 
# folder, $file = path of the plot, $matrix = path of the correlation matrix.

# For K562 cells (bin size 2000)
multiBigwigSummary bins -b *.bw --labels $define -bs 2000 -out $OUT_DIR/input.npz
# plotCorrelation reads the summary file written by the command above
plotCorrelation -in $OUT_DIR/input.npz --corMethod pearson --log1p --removeOutliers --skipZeros --plotTitle "Pearson Correlation of Replicates" --whatToPlot scatterplot -o $file --outFileCorMatrix $matrix

# For Drosophila NSCs (bin size 500)
multiBigwigSummary bins -b *.bw --labels $define -bs 500 -out $OUT_DIR/input.npz
plotCorrelation -in $OUT_DIR/input.npz --corMethod pearson --log1p --removeOutliers --skipZeros --plotTitle "Pearson Correlation of Replicates" --whatToPlot scatterplot -o $file --outFileCorMatrix $matrix


###### Scripts for generating genomic windows ####

#!/usr/bin/env python3

'''
	This script is used to create genomic windows given a peak file.
	
	Parameters
	----------
	peak_file_path : str
		Path to the peak file
	window_file_path : str
		Path to the output file containing the windows
	assembly : str 
		The assembly used ("GRCh38" or "dm6")
	window_size : int
		Size of the windows (1000 for drosophila, 5000 for human)
	
	date:	2018-01-21
	author:	Steffen Albrecht
	example:
			python genomic_windows.py 
			./data/NSC/FDR5/H3K4Me3/peaks/conv_K4Me3_1_peaks.bed 
			./data/NSC/FDR5/H3K4Me3/windows/1kb/conv_K4Me3_1.tsv 
			dm6 1000
'''

from sys import *

# Command line arguments, see the example in the docstring above. 
# A missing argument ends the script with a usage message instead of 
# an IndexError.
if len(argv) < 5:
	raise SystemExit('usage: python genomic_windows.py <peak_file_path> '
					 '<window_file_path> <assembly> <window_size>')
peak_file_path = argv[1]		# path to the peak file
window_file_path = argv[2]		# path to the output file for the windows
assembly = argv[3]				# "GRCh38" or "dm6"
window_size = int(argv[4])		# size of the windows


# Chromosomes used for the different reference genomes (assembly) for 
# human and drosophila.
chrom_used = {
	'dm6': ['chr2L', 'chr2R', 'chr3L', 'chr3R', 
			'chr4', 'chrM', 'chrX', 'chrY'],
	'GRCh38': ['chr' + str(number) for number in range(1, 23)] 
			  + ['chrX', 'chrY'],
}

# Chromosome sizes for each assembly, taken from the UCSC website.
chrom_sizes = {
	'dm6': {
		'chr2L': 23513712, 'chr2R': 25286936,
		'chr3L': 28110227, 'chr3R': 32079331,
		'chr4': 1348131, 'chrM': 19524,
		'chrX': 23542271, 'chrY': 3667352,
	},
	'GRCh38': {
		'chr1': 248956422, 'chr2': 242193529,
		'chr3': 198295559, 'chr4': 190214555,
		'chr5': 181538259, 'chr6': 170805979,
		'chr7': 159345973, 'chr8': 145138636,
		'chr9': 138394717, 'chr10': 133797422,
		'chr11': 135086622, 'chr12': 133275309,
		'chr13': 114364328, 'chr14': 107043718,
		'chr15': 101991189, 'chr16': 90338345,
		'chr17': 83257441, 'chr18': 80373285,
		'chr19': 58617616, 'chr20': 64444167,
		'chr21': 46709983, 'chr22': 50818468,
		'chrX': 156040895, 'chrY': 57227415,
	},
}

def get_peaks(file_path):
	'''
	Reads in the peaks grouped by chromosomes.
	
	Given the used chromosomes of the assembly (module-level 
	"chrom_used" and "assembly") this function parses a tab separated 
	peak file, usually a .bed file, and collects the position and the 
	enrichment value of all peaks listed in the file. Blank lines and 
	header lines ("#", "track", "browser") are skipped, as well as peaks 
	on chromosomes that are not used.
	
	Parameters
	----------
	file_path : str
		Path to the peak file
	
	Returns
	-------
	dict
		A dictionary containing a list of peaks grouped by chromosome.
		A peak is a tuple (chromosome, start, end, enrichment) in the 
		order given by the file.
	
	'''
	# only selected chromosomes are considered
	peaks = {chrom: [] for chrom in chrom_used[assembly]}
	
	# Peak files for H3K9Me3 in this case are slightly different: the 
	# chromosome names get "chr" prepended. The check depends on the 
	# path only, so it is evaluated once instead of once per line.
	add_chr_prefix = ('H3K9Me3' in file_path and 'K562' in file_path 
					  and 'FDR5' in file_path)
	
	with open(file_path, 'r') as f:
		for line in f:
			# blank lines and header lines do not describe a peak
			if not line.strip() or line.startswith(('#', 'track', 'browser')):
				continue
			fields = line.strip().split('\t')
			chrom = fields[0]
			if add_chr_prefix:
				chrom = 'chr' + chrom
			# the keys of "peaks" are exactly the used chromosomes
			if chrom not in peaks:
				continue
			start = int(fields[1])
			end = int(fields[2])
			# The value on the column with index 7 is used as enrichment.
			# NOTE(review): the original comment calls this the p-value; 
			# confirm against the peak caller's file format.
			enrichment = float(fields[7])
			peaks[chrom].append((chrom, start, end, enrichment))
	return peaks

def get_windows(peaks):
	'''
	Given the peaks in a dictionary it creates the windows
	
	Windows of "window_size" are created along each used chromosome 
	according to the chromosome sizes of the given assembly (module-level 
	"chrom_used", "chrom_sizes", "assembly" and "window_size"). Window 
	IDs start at 1 and keep counting across chromosomes. Only windows 
	ending before the chromosome end are created, so the last, 
	incomplete window of a chromosome is left out. For each window the 
	overlapping peaks are searched; a window is reported only if at 
	least one peak overlaps it.
	
	Parameters
	----------
	peaks : dict
		A dictionary containing lists of peaks grouped by chromosome.
		A peak is a tuple (chromosome, start, end, enrichment). The 
		lists do not have to be sorted and are not modified.
	
	Returns
	-------
	list
		A list containing the genomic windows. Each window is described
		by a tuple containing the window ID and the highest enrichment 
		value of all peaks overlapping the window.
	
	'''
	windows = []
	window_ID = 1
	for chrom in chrom_used[assembly]:
		chrom_size = chrom_sizes[assembly][chrom]
		# The early exit below is only correct for peaks ordered by their 
		# start position. Sorting a copy here makes the result independent 
		# of the order within the peak file.
		chrom_peaks = sorted(peaks.get(chrom, []), key=lambda peak: peak[1])
		n_peaks = len(chrom_peaks)
		# index of the first peak that can still overlap a window
		first_candidate = 0
		window_end = window_size
		while window_end < chrom_size:
			window_start = window_end - window_size
			
			# Leading peaks ending at or before the window start cannot 
			# overlap this or any later window, because the window start 
			# only increases. They are skipped once instead of being 
			# scanned again for every window.
			while (first_candidate < n_peaks 
					and chrom_peaks[first_candidate][2] <= window_start):
				first_candidate += 1
			
			enrichments = []
			for index in range(first_candidate, n_peaks):
				peak = chrom_peaks[index]
				if peak[2] > window_start:
					if peak[1] < window_end:
						enrichments.append(peak[3])
					# early exit: since peaks are sorted by start, no 
					# further peak can overlap once a peak starts at or 
					# behind the end of the window
					else:
						break
			
			# keep the window only if at least one peak overlaps it
			if enrichments:
				windows.append((window_ID, max(enrichments)))
			
			# update the running values
			window_end += window_size
			window_ID += 1
	# NOTE: window_ID is incremented after every window, so the printed 
	# value is one greater than the last ID that was assigned.
	print('Maximum windowID:', window_ID)
	return windows

# collect the peaks of the given file and derive the windows from them
peaks = get_peaks(peak_file_path)
windows = get_windows(peaks)

# store the windows in the given output file, one window per line with 
# window ID and enrichment separated by a tab (tsv-format)
with open(window_file_path, 'w') as f:
	f.writelines('%s\t%s\n' % (window_ID, enrichment) 
				 for window_ID, enrichment in windows)


###### Scripts for ROC and PRC curves ####

#!/usr/bin/env python3

'''
	This script is used to plot ROC as well as PR curves.
	
	Curves are plotted with different settings for the straight line 
	segment that appears because, at a certain decision threshold, all 
	possible reference windows are "true". Each plot is additionally 
	drawn with a flipped PR curve to get the recall on the y-axis.
	
	Parameters
	----------
	input_window_files : str
		Path to the file containing the dataset names (methods) and the path 
		to the file with the genomic windows. Important: always the very 
		first line is used as the reference dataset.
	assembly : str
		The assembly used to create the windows ("GRCh38" or "dm6")
	window_size : int
		Size of the windows (1000 for drosophila, 5000 for human)
	
	date:	2018-01-21
	author:	Steffen Albrecht
	example:
			python plot_ROCc_PRc.py ./NSC_H3K4Me3_noFDR.tsv dm6 1000
			
			example for input file:
				Conv R1	./data/NSC/noFDR/H3K4Me3/windows/1kb/Conv_K4Me3_Rep1.txt
				Conv R2	./data/NSC/noFDR/H3K4Me3/windows/1kb/Conv_K4Me3_Rep2.txt
				TAF Tum R1	./data/NSC/noFDR/H3K4Me3/windows/1kb/TAF_Tum_K4Me3_Rep1.txt
				TAF Tum R2	./data/NSC/noFDR/H3K4Me3/windows/1kb/TAF_Tum_K4Me3_Rep2.txt
'''

import os
from bisect import bisect_left
from sys import *

import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import auc

import util as util

# number of genomic windows in total, looked up by assembly and then by 
# window size
n_windows = {
	'dm6': {1000: 137565},
	'GRCh38': {5000: 617642},
}

# command line arguments: input file, assembly and window size
input_window_files, assembly = argv[1], argv[2]
window_size = int(argv[3])

# parse the input file for all datasets represented by genomic windows
reference_name = None
reference_set = None
methods = []
datasets = {}
with open(input_window_files, 'r') as f:
	for line in f:
		# blank lines (e.g. at the end of the file) carry no dataset
		if not line.strip():
			continue
		fields = line.strip().split('\t')
		method = fields[0]
		window_file_path = fields[1]
		# read in the windows together with the min and max enrichment
		windows, min_enrich, max_enrich = util.read_windows(window_file_path)
		dataset = {'windows': windows, 'min_enrich': min_enrich, 
				'max_enrich': max_enrich}
		# the first dataset listed is always the reference dataset, all 
		# following datasets are the methods compared against it
		if reference_set is None:
			reference_name = method
			reference_set = dataset
		else:
			datasets[method] = dataset
			methods.append(method)

# without a reference nothing can be evaluated, stop with a clear message
if reference_set is None:
	raise SystemExit('No datasets found in "%s"' % (input_window_files))

# start to calculate the measures needed to plot the curves
try:
	total_windows = n_windows[assembly][window_size]
except KeyError:
	raise SystemExit('No total number of windows known for assembly "%s" '
					 'and window size %d' % (assembly, window_size))
# every window of the reference counts as positive, all remaining 
# windows of the genome as negative
refPs = len(reference_set['windows']) 
refNs = total_windows - refPs
balance = float(refPs) / float(total_windows)
ref_window_set = set([window[0] for window in reference_set['windows']])

resolution = 10000 # resolution of the changing decision threshold
# values needed to plot the ROC and PR curves are collected within
# a dict, grouped by the method
curves = {} 
for method in methods:
	dataset = datasets[method]
	lowest_enrich = dataset['min_enrich']
	enrich_range = dataset['max_enrich'] - lowest_enrich
	
	# The enrichment values are split once into windows that are part of 
	# the reference and windows that are not, and sorted. The number of 
	# values >= a threshold is then found by binary search instead of 
	# scanning all windows again for each of the thresholds.
	ref_enrichments = sorted(window[1] for window in dataset['windows'] 
							 if window[0] in ref_window_set)
	other_enrichments = sorted(window[1] for window in dataset['windows'] 
							   if window[0] not in ref_window_set)
	
	# starting point of the curves, the closing point is added below
	precisions = [0.0] 
	recalls = [1.0]
	FPRs = [1.0]
	
	# for different decision thresholds calculate recall, FPR, and precision
	# thresholds are scaled into the range of the enrichment values
	for dt in range(0,(resolution+1),1):
		decision_threshold = dt / float(resolution)
		enrichment_threshold = decision_threshold * enrich_range
		enrichment_threshold += lowest_enrich
		
		# windows with an enrichment >= threshold are called positive
		TP = len(ref_enrichments) - bisect_left(ref_enrichments, 
												enrichment_threshold)
		FP = len(other_enrichments) - bisect_left(other_enrichments, 
												  enrichment_threshold)
		FN = refPs - TP
		TN = refNs - FP
		
		# based on TP, FP, TN, FN calculate precision, recall (TPR), and FPR
		prec = util.secure_division(TP, (TP + FP), -1)
		recall = util.secure_division(TP, (TP + FN), -1)
		fpr = util.secure_division(FP, (FP + TN), -1)
		
		# skip if one measure could not be calculated
		if prec == -1 or recall == -1 or fpr == -1:
			continue
		
		precisions.append(prec)
		recalls.append(recall)
		FPRs.append(fpr)
	precisions.append(1.0)
	recalls.append(0.0)
	FPRs.append(0.0)
	
	# collect everything needed to plot the curves
	curve = {'method': method}
	curve['precision'] = precisions
	curve['recall'] = recalls
	curve['FPR'] = FPRs
	curve['auROC'] = auc(FPRs, recalls)
	curve['auPRC'] = auc(recalls, precisions)
	curve['balance'] = balance
	curves[method] = curve
	print('Method:\t%s'%(method))
	print('\tauROC:  %.2f'%(curve['auROC']))
	print('\tauPRC:  %.2f\n'%(curve['auPRC']))


# plot ROC and PRC on one page
curves_width = 2
colors = list(sns.color_palette('hls', len(methods)))

# flipped: Either "default" (without flipping) or "flipped" plotting the 
# PR curve with flipped x and y axis to get the recall on y-axis.
# ended: Both, ROC and PR curve, have a straight ending line. There are
# 3 opportunities to  plot this straight line:
#	default: normal straight line
#	noended: not plotting this line
#	dashed:  plotting this line dashed

def _plot_curve(x, y, ended, color, label):
	'''
	Plots one curve into the current subplot.
	
	Parameters
	----------
	x, y : list
		Values of the curve, including the first and the last point that 
		were added around the calculated measures
	ended : str
		"default" plots all points as one solid line, "noended" leaves 
		out the first and the last point, "dashed" does the same but 
		additionally draws the segment between the first two points as 
		a dashed line
	color
		Color of the curve
	label : str
		Label of the curve used for the legend
	'''
	if ended == 'dashed':
		# the line style is given by keyword, a format string like 'k--' 
		# would define a second, conflicting color
		plt.plot(x[:2], y[:2], linestyle='--', lw=1, color=color)
	if ended in ('noended', 'dashed'):
		x = x[1:-1]
		y = y[1:-1]
	plt.plot(x, y, color=color, linewidth=curves_width, linestyle='-', 
		label=label)

# Only the file name of the input is used to name the plots. An input 
# given with a directory part (e.g. "./NSC_H3K4Me3_noFDR.tsv") would 
# otherwise end up in the middle of the output path and break it.
input_file_name = os.path.basename(input_window_files)
os.makedirs('./curves', exist_ok=True)

for flipped in ['default', 'flipped']:
	for ended in ['default', 'noended', 'dashed']:
		figure = plt.figure(figsize=(16,7.8))
		
		# ROC curve
		plt.subplot(121)
		for i, method in enumerate(methods):
			curve = curves[method]
			_plot_curve(curve['FPR'], curve['recall'], ended, colors[i], 
				'%s | ROC: %.2f | PRC: %.2f'%(curve['method'], curve['auROC'], curve['auPRC']))
		# diagonal as orientation
		plt.plot([0, 1], [0, 1], 'k--', lw=1)
		plt.xlim([0.0, 1.01])
		plt.ylim([0.0, 1.01])
		plt.xlabel('FPR')
		plt.ylabel('TPR')
		plt.title('ROC-curves')
		plt.legend(loc="lower right")
		super_title = '%s'%(input_window_files.replace('.tsv', ''))
		plt.suptitle(super_title)
		
		# PRC curve
		plt.subplot(122)
		for i, method in enumerate(methods):
			curve = curves[method]
			
			x = curve['recall']
			y = curve['precision']
			if flipped == 'flipped':
				x, y = y, x
			
			_plot_curve(x, y, ended, colors[i], 
				'%s - auPRC: %.3f'%(curve['method'], curve['auPRC']))
		
		# The balance is the same for all methods, so the module-level 
		# value is used instead of the curve left over from the loop.
		x_label = 'Recall'
		y_label = 'Precision'
		if flipped == 'flipped':
			x_label = 'Precision'
			y_label = 'Recall'
			plt.plot([balance, balance], [0,1], 'k--', lw=1)
		else:
			plt.plot([0, 1], [balance, balance], 'k--', lw=1)
		
		plt.xlim([0.0, 1.01])
		plt.ylim([0.0, 1.01])
		
		plt.xlabel(x_label)
		plt.ylabel(y_label)
		plt.title('PRC-curves')
		
		# adjust the subplots
		plt.subplots_adjust(top=0.9, bottom=0.1, left=0.07, right=0.95, hspace=0.3,
						wspace=0.25)
		
		# save the plot as pdf in folder "curves"
		plt.savefig('./curves/%s_%s_%s'%(flipped, ended, input_file_name.replace('.tsv', '.pdf')))
		# release the figure, otherwise all figures stay in memory
		plt.close(figure)


###### Scripts for generating clusterogram ####

#!/usr/bin/env python3

'''
	Given the different datasets within the input file the distances are
	calculated and a clustermap is plotted
	
	Parameters
	----------
	input_window_files : str
		Path to the tab separated file containing the names of the datasets
		and the file paths to the files containing the genomic windows
	
	date:	2018-01-21
	author:	Steffen Albrecht
	example:
			python clusterogram.py NSC_H3K4Me3_FDR5.tsv
			
			example for the input file:
				Conv R1	./data/NSC/FDR5/H3K4Me3/windows/1kb/conv_K4Me3_1_peaks.txt
				Conv R2	./data/NSC/FDR5/H3K4Me3/windows/1kb/conv_K4Me3_2_peaks.txt
				TAF Tum R1	./data/NSC/FDR5/H3K4Me3/windows/1kb/Tum_1k_K4Me3_1_peaks.txt
				...	...
'''

from sys import *
import pandas as pd
import seaborn as sns

import util as util

# path to the tab separated file listing the datasets
input_window_files = argv[1]

# parse the input file for all datasets represented by genomic windows
methods = []
datasets = {}
with open(input_window_files, 'r') as f:
	for line in f:
		# blank lines (e.g. at the end of the file) carry no dataset
		if not line.strip():
			continue
		fields = line.strip().split('\t')
		method = fields[0]
		window_file_path = fields[1]
		# read in the windows and keep only the window IDs as a set, the 
		# min and max enrichment are not needed here
		windows, min_enrich, max_enrich = util.read_windows(window_file_path)
		datasets[method] = set(window[0] for window in windows)
		methods.append(method)

# Preprocessing for the clustermap: for every pair of datasets the 
# distance is calculated as 1 - Jaccard Index of their window sets. 
# Each dataset gets one list of distances, ordered like "methods".
indexes = list(methods)
data_dict = {
	method1: [1.0 - util.jaccard_index(datasets[method1], datasets[method2]) 
			  for method2 in methods]
	for method1 in methods
}

# draw the clustermap from seaborn using average linkage clustering
cm_data = pd.DataFrame(data_dict, index=indexes)
clustermap = sns.clustermap(
	cm_data, 
	method='average', 
	col_cluster=True, 
	cmap='Blues_r', 
	figsize=(8,8), 
	linewidth=2,
)
# the given file name is used as title and, as pdf, for the output file
clustermap.fig.suptitle('%s\n'%(input_window_files))
output_path = './%s'%(input_window_files.replace('.tsv', '.pdf'))
clustermap.savefig(output_path)