### Mapping to reference genome hg38 or dm6 using Bowtie2 ### bowtie2 -x $INDEX_PATH/genome -p 24 -1 $SEQ_PATH/*Lib20_1*.gz -2 $SEQ_PATH/*Lib20_2*.gz | samtools view -bS > K562_K27Me3_Abcam_REP1.bam bowtie2 -x $INDEX_PATH/genome -p 24 -1 $SEQ_PATH/*Lib21_1*.gz -2 $SEQ_PATH/*Lib21_2*.gz | samtools view -bS > K562_K27Me3_Abcam_REP2.bam ### Sorting mapped reads with respect to position ### samtools sort K562_K27Me3_Abcam_REP1.bam > $SEQ_PATH/sorted/K562_K27Me3_Abcam_REP1_sorted.bam ### Indexing sorted reads ### samtools index $SEQ_PATH/sorted/K562_K27Me3_Abcam_REP1_sorted.bam ### Generating BigWig files using DeepTools (v 3.1.3) ### BASE_NAME=$(basename ${i::-4}) for i in $SEQ_PATH/*sorted.bam; do /home/pmore/Piyush_Project/Softwares/Deeptools/bin/bamCoverage -b $i --normalizeUsing RPKM -bs 50 --smoothLength 175 -o $SEQ_PATH/bw-normalized/${BASE_NAME%}.bw --numberOfProcessors 16 done ### Peak calling for narrow marks ( version 2.1.1.20160309) ### Macs2 callpeak -t $File -c $File -f BAMPE -g dm --outdir --name Macs2 callpeak -t $File -c $File -f BAMPE -g hs --outdir --name ### Peak calling for broad marks ( version 2.1.1.20160309) ### Macs2 callpeak -t $File -c $File -f BAMPE -g dm --broad --outdir --name Macs2 callpeak -t $File -c $File -f BAMPE -g hs --broad --outdir --name ### Peak calling without FDR restriction ( version 2.1.1.20160309, used for ROC and PRC curves) ### Macs2 callpeak -t $File -c $File -f BAMPE -g dm -q 0.99 --outdir --name Macs2 callpeak -t $File -c $File -f BAMPE -g hs -q 0.99 --outdir --name Macs2 callpeak -t $File -c $File -f BAMPE -g dm --broad --broad-cutoff 0.99 --outdir --name Macs2 callpeak -t $File -c $File -f BAMPE -g hs --broad --broad-cutoff 0.99 --outdir --name #### Pearsons correlation plots (Deeptools v 3.1.3) #### For K562 cells multiBigwigSummary bins -b *.bw --labels $define -bs 2000 -out $OUT_DIR/input.npz plotCorrelation -in $input.npz --corMethod pearson --log1p --removeOutliers --skipZeros --plotTitle "Pearson Correlation of Replicates" --whatToPlot scatterplot -o $file --outFileCorMatrix $matrix For Drosophila NSCs multiBigwigSummary bins -b *.bw --labels $define -bs 500 -out $OUT_DIR/input.npz plotCorrelation -in $input.npz --corMethod pearson --log1p --removeOutliers --skipZeros --plotTitle "Pearson Correlation of Replicates" --whatToPlot scatterplot -o $file --outFileCorMatrix $matrix ###### Scripts for generating genomic windows #### #!/usr/bin/env python3 ''' This script is used to create genomic windows given a peak file. Parameters ---------- peak_file_path : str Path to the peak file window_file_path : str Path to the output file containing the windows assembly : str The assembly used ("GRCh38" or "dm6") window_size : int Size of the windows (1000 for drosophila, 5000 for human) date: 2018-01-21 author: Steffen Albrecht example: python genomic_windows.py ./data/NSC/FDR5/H3K4Me3/peaks/conv_K4Me3_1_peaks.bed ./data/NSC/FDR5/H3K4Me3/windows/1kb/conv_K4Me3_1.tsv dm6 1000 ''' from sys import * global peak_file_path, window_file_path, assembly, window_size peak_file_path = argv[1] window_file_path = argv[2] assembly = argv[3] window_size = int(argv[4]) # Chromosmes used for the different reference genomes (assembly) for human # and drosophila. Chromosome sizes were taken from the UCSC website. global chrom_used, chrom_sizes chrom_used = {} chrom_used['dm6'] = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr4', 'chrM', 'chrX', 'chrY'] chrom_used['GRCh38'] = ['chr%d'%(i) for i in range(1,23)] + ['chrX', 'chrY'] chrom_sizes = {} chrom_sizes['dm6'] = {'chr2L': 23513712, 'chr2R': 25286936, 'chr3L': 28110227, 'chr3R': 32079331, 'chr4': 1348131, 'chrM': 19524, 'chrX': 23542271, 'chrY': 3667352} chrom_sizes['GRCh38'] = {'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, } def get_peaks(file_path): ''' Reads in the peaks grouped by chromosmes. Given the used chromosomes defined above this function parses a file usually a .bed file and collects the position and p-value of all peaks listed in the file. Parameters ---------- file_path : str Path to the peak file Returns ------- dict A dictionary containing a list of peaks grouped by chromosome a peak is a tuple containing the peaks start and end position as well as the chromosome and its p-value. ''' peaks = {} # only selected chromosomes are considered for chrom in chrom_used[assembly]: peaks[chrom] = [] with open(file_path, 'r') as f: for line in f: line = line.strip().split('\t') chrom = line[0] # an additional if clause, because peak files for H3K9Me3 in this case are slightly different if 'H3K9Me3' in file_path and 'K562' in file_path and 'FDR5' in file_path: chrom = 'chr' + chrom start = int(line[1]) end = int(line[2]) # bed files keep the p-value on column with index 7 enrichment = float(line[7]) if chrom in set(chrom_used[assembly]): peaks[chrom].append((chrom, start, end, enrichment)) return peaks def get_windows(peaks): ''' Given the peaks in a dictionary it creates the windows Windows are created according to the chromosome sizes of the given assembly. For each window an overlapping peak is searched. Parameters ---------- peaks : str A dictionary containing lists of peaks grouped by chromosome Returns ------- list A list containing the genomic windows. Each window is descirbed by a tuple containing the windows ID and the associtated p-value. ''' windows = [] window_ID = 1 for chrom in chrom_used[assembly]: chrom_size = chrom_sizes[assembly][chrom] window_end = window_size while window_end < chrom_size: # create a start and check all windows # for the current chromosome window_start = window_end - window_size enrichments = [] for peak in peaks[chrom]: if peak[2] > window_start: if peak[1] < window_end: enrichments.append(peak[3]) # early exit: since peaks are sorted the loop can be # broken when the start of the window is "behind" the # end of the current peak else: break # check the current window of possible overlapping peaks if len(enrichments) != 0: windows.append((window_ID, max(enrichments))) # update the running values window_end += window_size window_ID += 1 print('Maximum windowID:', window_ID) return windows # reads in the peaks peaks = get_peaks(peak_file_path) # creates the windows windows = get_windows(peaks) # write out the windows to the given output file path in tsv-format with open(window_file_path, 'w') as f: for window in windows: f.write(str(window[0]) + '\t' + str(window[1]) + '\n') ###### Scripts for ROC and PRC curves #### #!/usr/bin/env python3 ''' This script is used to plot ROC as well as PR curves. Curves are plotted with different settings with respect to the straight line that is present because of the fact that at a certain decision threshold all possible reference windows are "true". The plots are also again plotted with flipped PR curve to get the recall on the y-axis. Parameters ---------- input_window_files : str Path to the file containing the dataset names (methods) and the path to the file with the genomic windows. Important: always the very first line is used as the reference dataset. assembly : str The assembly used to create the windows ("GRCh38" or "dm6") window_size : int Size of the windows (1000 for drosophila, 5000 for human) date: 2018-01-21 author: Steffen Albrecht example: python plot_ROCc_PRc.py ./NSC_H3K4Me3_noFDR.tsv dm6 1000 example for input file: Conv R1 ./data/NSC/noFDR/H3K4Me3/windows/1kb/Conv_K4Me3_Rep1.txt Conv R2 ./data/NSC/noFDR/H3K4Me3/windows/1kb/Conv_K4Me3_Rep2.txt TAF Tum R1 ./data/NSC/noFDR/H3K4Me3/windows/1kb/TAF_Tum_K4Me3_Rep1.txt TAF Tum R2 ./data/NSC/noFDR/H3K4Me3/windows/1kb/TAF_Tum_K4Me3_Rep2.txt ''' from sys import * from sklearn.metrics import auc import matplotlib.pyplot as plt import seaborn as sns import util as util # total number of windows depending on assembly and window size n_windows = {} n_windows['dm6'] = {1000: 137565} n_windows['GRCh38'] = {5000: 617642} # get the command line arguments input_window_files = argv[1] assembly = argv[2] window_size = int(argv[3]) # parse the input file for all datasets represented by genomic windows reference_name = None reference_set = None methods = [] datasets = {} with open(input_window_files, 'r') as f: for i, line in enumerate(f): line = line.strip().split('\t') method = line[0] window_file_path = line[1] # read in the windows and convert into sets windows, min_enrich, max_enrich = util.read_windows(window_file_path) # read in the windows, max and min enrichment of each method and the # reference, the first line is always the reference dataset if i == 0: reference_name = method reference_set = {'windows': windows, 'min_enrich': min_enrich, 'max_enrich': max_enrich} else: datasets[method] = {'windows': windows, 'min_enrich': min_enrich, 'max_enrich': max_enrich} methods.append(method) # start to calculate the measures needed to plot the curves refPs = len(reference_set['windows']) refNs = n_windows[assembly][window_size] - refPs balance = float(refPs) / float(n_windows[assembly][window_size]) ref_window_set = set([window[0] for window in reference_set['windows']]) resolution = 10000 # resultion of the changing decision threshold # values needed to plot the ROC and PR curves are collected within # a dict, grouped by the method curves = {} for method in methods: precisions = [0.0] recalls = [1.0] FPRs = [1.0] # for different decison thresholds calculate recall, FPR, and precision # thresholds are calculated based on the enrichment values (p-value) for dt in range(0,(resolution+1),1): decision_threshold = dt / float(resolution) enrichment_threshold = decision_threshold * (datasets[method]['max_enrich'] - datasets[method]['min_enrich']) enrichment_threshold += datasets[method]['min_enrich'] TP = 0 FP = 0 for window in datasets[method]['windows']: if window[1] >= enrichment_threshold: if window[0] in ref_window_set: TP += 1 else: FP += 1 FN = refPs - TP TN = refNs - FP # based in TP, FP, TN, FN calculate precision, recall (TPR), and FPR prec = util.secure_division(TP, (TP + FP), -1) recall = util.secure_division(TP, (TP + FN), -1) fpr = util.secure_division(FP, (FP + TN), -1) # skip if one measure could not be calculated if prec == -1 or recall == -1 or fpr == -1: continue precisions.append(prec) recalls.append(recall) FPRs.append(fpr) precisions.append(1.0) recalls.append(0.0) FPRs.append(0.0) # collect everything needed to plot the curves curve = {'method': method} curve['precision'] = precisions curve['recall'] = recalls curve['FPR'] = FPRs curve['auROC'] = auc(FPRs, recalls) curve['auPRC'] = auc(recalls, precisions) curve['balance'] = balance curves[method] = curve print('Method:\t%s'%(method)) print('\tauROC: %.2f'%(curve['auROC'])) print('\tauPRC: %.2f\n'%(curve['auPRC'])) # plot ROC and PRC on one page curves_width = 2 colors = list(sns.color_palette('hls', len(methods))) # flipped: Either "default" (without flipping) or "flipped" plotting the # PR curve with flipped x and y axis to get the recall on y-axis. # ended: Both, ROC and PR curve, have a straight ending line. There are # 3 opportunities to plot this straight line: # default: normal straight line # noended: not plotting this line # dashed: plotting this line dashed for flipped in ['default', 'flipped']: for ended in ['default', 'noended', 'dashed']: plt.figure(figsize=(16,7.8)) # ROC curve plt.subplot(121) for i, method in enumerate(methods): curve = curves[method] x = curve['FPR'] y = curve['recall'] if ended == 'noended': x = x[1:-1] y = y[1:-1] if ended == 'dashed': dashed = [[x[0], x[1]]] dashed.append([y[0], y[1]]) x = x[1:-1] y = y[1:-1] plt.plot(dashed[0], dashed[1], 'k--', lw=1, color=colors[i]) plt.plot(x, y, color=colors[i], linewidth=curves_width, linestyle='-', label='%s | ROC: %.2f | PRC: %.2f'%(curve['method'], curve['auROC'], curve['auPRC'])) plt.plot([0, 1], [0, 1], 'k--', lw=1) plt.xlim([0.0, 1.01]) plt.ylim([0.0, 1.01]) plt.xlabel('FPR') plt.ylabel('TPR') plt.title('ROC-curves') plt.legend(loc="lower right") super_title = '%s'%(input_window_files.replace('.tsv', '')) plt.suptitle(super_title) # PRC curve plt.subplot(122) for i, method in enumerate(methods): curve = curves[method] x = curve['recall'] y = curve['precision'] if flipped == 'flipped': x = curve['precision'] y = curve['recall'] if ended == 'noended': x = x[1:-1] y = y[1:-1] if ended == 'dashed': dashed = [[x[0], x[1]]] dashed.append([y[0], y[1]]) x = x[1:-1] y = y[1:-1] plt.plot(dashed[0], dashed[1], 'k--', lw=1, color=colors[i]) plt.plot(x, y, color=colors[i], linewidth=curves_width, linestyle='-', label='%s - auPRC: %.3f'%(curve['method'], curve['auPRC'])) x_label = 'Recall' y_label = 'Precision' if flipped == 'flipped': x_label = 'Precision' y_label = 'Recall' plt.plot([curve['balance'], curve['balance']], [0,1], 'k--', lw=1) else: plt.plot([0, 1], [curve['balance'], curve['balance']], 'k--', lw=1) plt.xlim([0.0, 1.01]) plt.ylim([0.0, 1.01]) plt.xlabel(x_label) plt.ylabel(y_label) plt.title('PRC-curves') # adjust the subplots plt.subplots_adjust(top=0.9, bottom=0.1, left=0.07, right=0.95, hspace=0.3, wspace=0.25) # save the plot as pdf in folder "curves" plt.savefig('./curves/%s_%s_%s'%(flipped, ended, input_window_files.replace('.tsv', '.pdf'))) ###### Scripts for generating clusterogram #### #!/usr/bin/env python3 ''' Given the different datasets within the input file the distances are calculated and a clustermap is plotted Parameters ---------- input_window_files : str Path to the tab separated file containing the names of the datasets and the file paths to the files containing the genomic windows date: 2018-01-21 author: Steffen Albrecht example: python clusterogram.py NSC_H3K4Me3_FDR5.tsv example for the input file: Conv R1 ./data/NSC/FDR5/H3K4Me3/windows/1kb/conv_K4Me3_1_peaks.txt Conv R2 ./data/NSC/FDR5/H3K4Me3/windows/1kb/conv_K4Me3_2_peaks.txt TAF Tum R1 ./data/NSC/FDR5/H3K4Me3/windows/1kb/Tum_1k_K4Me3_1_peaks.txt ... ... ''' from sys import * import pandas as pd import seaborn as sns import util as util global input_window_files input_window_files = argv[1] # parse the input file for all datasets represented by genomic windows methods = [] datasets = {} with open(input_window_files, 'r') as f: for line in f: line = line.strip().split('\t') method = line[0] window_file_path = line[1] # read in the windows and convert into sets windows, min_enrich, max_enrich = util.read_windows(window_file_path) datasets[method] = set([window[0] for window in windows]) methods.append(method) # calculate the Jaccard Indices and use them as distance # as preprocessing for the clustermap data_dict = {} indexes = [] for method1 in methods: similarities = [] for method2 in methods: ji = 1.0 - util.jaccard_index(datasets[method1], datasets[method2]) similarities.append(ji) data_dict[method1] = similarities indexes.append(method1) # plotting the clustermap from seaborn with average linkage clustering cm_data = pd.DataFrame(data_dict, index = indexes) clustermap = sns.clustermap(cm_data, linewidth=2, method='average', figsize=(8,8), cmap='Blues_r', col_cluster=True) # the title is simply the name of the given file plot_title = '%s\n'%(input_window_files) clustermap.fig.suptitle(plot_title) clustermap.savefig('./%s'%(input_window_files.replace('.tsv', '.pdf')))