Precision Medicine Bioinformatics

Introduction to bioinformatics for DNA and RNA sequence analysis

Post-Alignment QC

Calculate QC metric for bam files using samtools and picard See docs here: https://github.com/genome/cancer-genomics-workflow/wiki/Alignment

Run Samtools flagstat

cd /workspace/align/
samtools flagstat /workspace/align/Exome_Norm_sorted_mrkdup_bqsr.bam > /workspace/align/Exome_Norm_flagstat.txt
samtools flagstat /workspace/align/Exome_Tumor_sorted_mrkdup_bqsr.bam > /workspace/align/Exome_Tumor_flagstat.txt
# Runtime: < 2min
samtools flagstat /workspace/align/WGS_Norm_merged_sorted_mrkdup_bqsr.bam > /workspace/align/WGS_Norm_merged_flagstat.txt
samtools flagstat /workspace/align/WGS_Tumor_merged_sorted_mrkdup_bqsr.bam > /workspace/align/WGS_Tumor_merged_flagstat.txt

Run various picard CollectMetrics tools

cd /workspace/align/
java -Xmx24g -jar $PICARD CollectInsertSizeMetrics I=/workspace/align/Exome_Norm_sorted_mrkdup_bqsr.bam O=/workspace/align/Exome_Norm_insert_size_metrics.txt H=/workspace/align/Exome_Norm_insert_size_metrics.pdf
java -Xmx24g -jar $PICARD CollectInsertSizeMetrics I=/workspace/align/Exome_Tumor_sorted_mrkdup_bqsr.bam O=/workspace/align/Exome_Tumor_insert_size_metrics.txt H=/workspace/align/Exome_Tumor_insert_size_metrics.pdf
java -Xmx24g -jar $PICARD CollectInsertSizeMetrics I=/workspace/align/WGS_Norm_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Norm_merged_insert_size_metrics.txt H=/workspace/align/WGS_Norm_merged_insert_size_metrics.pdf
java -Xmx24g -jar $PICARD CollectInsertSizeMetrics I=/workspace/align/WGS_Tumor_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Tumor_merged_insert_size_metrics.txt H=/workspace/align/WGS_Tumor_merged_insert_size_metrics.pdf

# Picard CollectAlignmentSummaryMetrics
java -Xmx24g -jar $PICARD CollectAlignmentSummaryMetrics I=/workspace/align/Exome_Norm_sorted_mrkdup_bqsr.bam O=/workspace/align/Exome_Norm_alignment_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa
java -Xmx24g -jar $PICARD CollectAlignmentSummaryMetrics I=/workspace/align/Exome_Tumor_sorted_mrkdup_bqsr.bam O=/workspace/align/Exome_Tumor_exome_alignment_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa
java -Xmx24g -jar $PICARD CollectAlignmentSummaryMetrics I=/workspace/align/WGS_Norm_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Norm_merged_alignment_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa
java -Xmx24g -jar $PICARD CollectAlignmentSummaryMetrics I=/workspace/align/WGS_Tumor_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Tumor_merged_alignment_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa

#Picard CollectHsMetrics
#Exome Only
java -Xmx24g -jar $PICARD CollectHsMetrics I=/workspace/align/Exome_Norm_sorted_mrkdup_bqsr.bam O=/workspace/align/Exome_Norm_hs_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa BI=/workspace/inputs/references/exome/probe_regions.bed.interval_list TI=/workspace/inputs/references/exome/exome_regions.bed.interval_list
java -Xmx24g -jar $PICARD CollectHsMetrics I=/workspace/align/Exome_Tumor_sorted_mrkdup_bqsr.bam O=/workspace/align/Exome_Tumor_hs_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa BI=/workspace/inputs/references/exome/probe_regions.bed.interval_list TI=/workspace/inputs/references/exome/exome_regions.bed.interval_list

#Picard CollectGcBiasMetrics
#WGS only
java -Xmx24g -jar $PICARD CollectGcBiasMetrics I=/workspace/align/WGS_Norm_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Norm_merged_gc_bias_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa CHART=/workspace/align/WGS_Norm_merged_gc_bias_metrics.pdf S=/workspace/align/WGS_Norm_merged_gc_bias_summary.txt
java -Xmx24g -jar $PICARD CollectGcBiasMetrics I=/workspace/align/WGS_Tumor_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Tumor_merged_gc_bias_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa CHART=/workspace/align/WGS_Tumor_merged_gc_bias_metrics.pdf S=/workspace/align/WGS_Tumor_merged_gc_bias_summary.txt

#Picard CollectWgsMetrics

#First we need to create the Autosomal Chromosome Interval List
egrep 'chr[0-9]{1,2}\s' /workspace/inputs/references/genome/ref_genome.fa.fai | awk '{print $1"\t1\t"$2"\t+\t"$1}' | cat /workspace/inputs/references/genome/ref_genome.dict - > /workspace/inputs/references/genome/ref_genome_autosomal.interval_list

java -Xmx24g -jar $PICARD CollectWgsMetrics I=/workspace/align/WGS_Norm_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Norm_merged_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa INTERVALS=/workspace/inputs/references/genome/ref_genome_autosomal.interval_list
java -Xmx24g -jar $PICARD CollectWgsMetrics I=/workspace/align/WGS_Tumor_merged_sorted_mrkdup_bqsr.bam O=/workspace/align/WGS_Tumor_merged_metrics.txt R=/workspace/inputs/references/genome/ref_genome.fa INTERVALS=/workspace/inputs/references/genome/ref_genome_autosomal.interval_list

Run FastQC

cd /workspace/align

fastqc -t 8 Exome_Norm_sorted_mrkdup_bqsr.bam
fastqc -t 8 Exome_Tumor_sorted_mrkdup_bqsr.bam
tree

fastqc -t 8 WGS_Norm_merged_sorted_mrkdup_bqsr.bam
fastqc -t 8 WGS_Tumor_merged_sorted_mrkdup_bqsr.bam
tree

#fastqc RNAseq_Norm
#fastqc RNAseq_Tumor
#tree

Run MultiQC to produce a final report

cd /workspace/align
mkdir post_align_qc
cd post_align_qc
multiqc /workspace/align/
tree

Here is what the multiqc report should look like, we can view the webpage by loading the html file generated by multiqc: