# The overall goal of this script is to analyse the real dataset
# This script performs an advanced analysis

dataDir=/Users/timothyh/home/courses/course_vc_2012_nb2_autumn_incorporateImprovements
read1FqFile=${dataDir}/inputData/reads_agilentV1_chr5/real/real_agilentV1_chr5.R1.fastq
read2FqFile=${dataDir}/inputData/reads_agilentV1_chr5/real/real_agilentV1_chr5.R2.fastq
refFile=${dataDir}/inputData/human_g1k_v37_chr5/gatkBundle/human_g1k_v37_chr5.fasta
knownVariants=${dataDir}/inputData/human_g1k_v37_chr5/gatkBundle/dbsnp_132.b37_chr5.vcf
tiles=${dataDir}/inputData/human_g1k_v37_chr5/agilentV1/agilent37M.chr5.b37.bed

## Mapping using BWA

bwa aln -t 3 $refFile $read1FqFile > aln_sa1.sai
bwa aln -t 3 $refFile $read2FqFile > aln_sa2.sai
bwa sampe $refFile aln_sa1.sai aln_sa2.sai $read1FqFile $read2FqFile > aln.sam

## Making sure all mate information is correct

picard FixMateInformation.jar SORT_ORDER=coordinate INPUT=aln.sam OUTPUT=aln.posiSrt.bam CREATE_INDEX=true

## Adding meta data information

picard AddOrReplaceReadGroups.jar \
CREATE_INDEX=true \
INPUT=aln.posiSrt.bam \
OUTPUT=aln.posiSrt.withRG.bam \
RGLB="agilentCapture" \
RGPL="Illumina" \
RGPU="unknown" \
RGSM="anonymousIndividual" \
RGCN="NSC" \
RGDS="Some_description_of_the_read_group"

## Checking that we have a valid BAM file

picard ValidateSamFile.jar \
VALIDATION_STRINGENCY=STRICT \
MODE=VERBOSE \
IGNORE_WARNINGS=false \
REFERENCE_SEQUENCE=${refFile} \
VALIDATE_INDEX=true \
INPUT=aln.posiSrt.withRG.bam

## Identifying where re-alignment is required and then performing re-alignment

gatk -T RealignerTargetCreator \
-R $refFile \
-o realignment.intervals \
-I aln.posiSrt.withRG.bam \
--known $knownVariants

gatk -T IndelRealigner \
-I aln.posiSrt.withRG.bam \
-R $refFile \
-targetIntervals realignment.intervals \
-o aln.posiSrt.withRG.clean.bam \
-compress 0 \
--disable_bam_indexing

## Marking duplicates

picard MarkDuplicates.jar \
INPUT=aln.posiSrt.withRG.clean.bam \
OUTPUT=aln.posiSrt.withRG.clean.dedup.bam \
METRICS_FILE=aln.posiSrt.withRG.clean.dedup.bam_metrics.duplication.txt \
CREATE_INDEX=true \
REMOVE_DUPLICATES=false \
COMMENT="MarkDuplicates.jar_run_on_cleaned_alignments"

../beautifyDuplicationMetricsReport.bash aln.posiSrt.withRG.clean.dedup.bam_metrics.duplication.txt > aln.posiSrt.withRG.clean.dedup.bam_metrics.duplication.beautified.txt

## Base quality score recalibration

gatk CountCovariates \
-l INFO \
-R $refFile \
-knownSites $knownVariants \
-I aln.posiSrt.withRG.clean.dedup.bam \
-cov ReadGroupCovariate \
-cov QualityScoreCovariate \
-cov DinucCovariate \
-cov CycleCovariate \
-recalFile table.recal_data.pre.csv

gatk TableRecalibration \
-l INFO \
-R $refFile \
-I aln.posiSrt.withRG.clean.dedup.bam \
-recalFile table.recal_data.pre.csv \
--out aln.posiSrt.withRG.clean.dedup.recal.bam \
-OQ \
-pQ 5 \
--disable_bam_indexing

samtools index aln.posiSrt.withRG.clean.dedup.recal.bam

## validation of BAM file

picard ValidateSamFile.jar \
VALIDATION_STRINGENCY=STRICT \
MODE=VERBOSE \
IGNORE_WARNINGS=false \
REFERENCE_SEQUENCE=${refFile} \
VALIDATE_INDEX=true \
INPUT=aln.posiSrt.withRG.clean.dedup.recal.bam

# define a variable to hold the name of this BAM file
bamFile=aln.posiSrt.withRG.clean.dedup.recal.bam

## alignment metrics

picard CollectAlignmentSummaryMetrics.jar \
INPUT=${bamFile} \
OUTPUT=${bamFile}_metrics.alignment.txt \
REFERENCE_SEQUENCE=$refFile

../beautifyAlignmentMetricsReport.bash ${bamFile}_metrics.alignment.txt > ${bamFile}_metrics.alignment.beautified.txt

## Insert size metrics

picard CollectInsertSizeMetrics.jar \
HISTOGRAM_FILE=${bamFile}_metrics.insert.pdf \
INPUT=${bamFile} \
OUTPUT=${bamFile}_metrics.insert.txt

../beautifyInsertMetricsReport.bash ${bamFile}_metrics.insert.txt > ${bamFile}_metrics.insert.beautified.txt

## Coverage metrics

picard CalculateHsMetrics.jar \
BAIT_INTERVALS=${dataDir}/inputData/human_g1k_v37_chr5/agilentV1/agilent37M.chr5.b37.txt \
TARGET_INTERVALS=${dataDir}/inputData/human_g1k_v37_chr5/agilentV1/agilent37M.chr5.b37.txt \
INPUT=${bamFile} OUTPUT=${bamFile}_metrics.coverage.txt \
REFERENCE_SEQUENCE=${refFile} \
PER_TARGET_COVERAGE=${bamFile}_metrics.coveragePerTarget.txt

../beautifyCoverageMetricsReport.bash ${bamFile}_metrics.coverage.txt > ${bamFile}_metrics.coverage.beautified.txt 

function compressAndIndex(){
igvtools index $1
bgzip -c $1 > $1.gz
tabix -p vcf $1.gz
}

## Variant calling

snps=snpsHQ.vcf
indels=indelsHQ.vcf

gatk UnifiedGenotyper \
--validation_strictness STRICT \
-R $refFile \
-I $bamFile \
-o $snps \
--dbsnp $knownVariants

gatk UnifiedGenotyper \
--validation_strictness STRICT \
-R $refFile \
-glm INDEL \
-I $bamFile \
-o $indels \
--dbsnp $knownVariants 

compressAndIndex $snps
compressAndIndex $indels

## Variant filtration

snpsFiltered=snpsHQ.filt.vcf
indelsFiltered=indelsHQ.filt.vcf

gatk VariantFiltration \
-R $refFile \
--variant $snps \
-o $snpsFiltered \
--filterExpression "QD<2.0" \
--filterExpression "MQ<40.0" \
--filterExpression "FS>60.0" \
--filterName "QDFilter" \
--filterName "MQFilter" \
--filterName "FSFilter"

gatk VariantFiltration \
-R $refFile \
--variant $indels \
-o $indelsFiltered \
--filterExpression "QD<2.0" \
--filterExpression "FS>200.0" \
--filterName "QDFilter" \
--filterName "FSFilter"

compressAndIndex $snpsFiltered
compressAndIndex $indelsFiltered


# Functional annotation

vcfInput="snpsHQ.filt.vcf" # vcf file to be annotated by snpEff
outputDir=.


#### Variables that may be set by user but probably should not be
# In particular, snpEffDb should not be changed as other versions are know to give erroneous results (see background)
snpEffDb=GRCh37.64 # define the version of the Ensembl database to be used by snpEff, location of file is specified in snpEff's config file
dbNSFP=/Users/timothyh/home/binHTS/snpSift_1.7/dbNSFP2.0b3.txt # download from snpSift website

##### Output files >> should not require any changes to code beyond this point
function filenameWoExt(){
basename $1 | sed 's/\(.*\)\.[a-z]*/\1/'
}
vcfAllImpacts=`filenameWoExt $vcfInput`.$snpEffDb.vcf # output by snpEff
vcfTopImpact=`filenameWoExt $vcfAllImpacts`.topImpact.vcf # output by gatk
vcfNSFP=`filenameWoExt $vcfTopImpact`.NSFP.vcf # output by snpSift

# Annotate all effects given ensembl db using snpEff
cd $outputDir
snpEff2.0.5 eff -v -onlyCoding true -i vcf -o vcf $snpEffDb $vcfInput > $vcfAllImpacts


# Here we pick out the top impact annotation for each variant
# There are alternatives here e.g. picking out the transcript that is in CCDS or in REFSEQ
gatk2.1 VariantAnnotator \
-R $refFile \
-A SnpEff \
--variant $vcfInput \
--snpEffFile $vcfAllImpacts \
-L $vcfInput \
-o $vcfTopImpact

# Add detailed information for non-synomymous snps from the dbNSFP db using snpSift
snpSift1.7 dbnsfp -v $dbNSFP $vcfTopImpact > $vcfNSFP

# Transform into a table for easier filtering
gatk2.1 VariantsToTable \
-R $refFile \
-V $vcfNSFP \
-o ${vcfNSFP}_asTable \
--allowMissingData \
--showFiltered \
-F CHROM \
-F POS \
-F ID \
-F REF \
-F ALT \
-F QUAL \
-F FILTER \
-GF GT \
-GF GQ \
-F HET \
-F HOM-REF \
-F HOM-VAR \
-F TYPE \
-F VAR \
-F SNPEFF_EFFECT \
-F SNPEFF_IMPACT \
-F SNPEFF_FUNCTIONAL_CLASS \
-F SNPEFF_CODON_CHANGE \
-F SNPEFF_AMINO_ACID_CHANGE \
-F SNPEFF_GENE_NAME \
-F SNPEFF_GENE_BIOTYPE \
-F SNPEFF_TRANSCRIPT_ID \
-F SNPEFF_EXON_ID \
-F dbnsfpUniprot_acc \
-F dbnsfpInterpro_domain \
-F dbnsfpEnsembl_transcriptid \
-F dbnsfpSIFT_score \
-F dbnsfpPolyphen2_HVAR_pred \
-F dbnsfpGERP++_NR \
-F dbnsfpGERP++_RS \
-F dbnsfp29way_logOdds \
-F dbnsfp1000Gp1_AF \
-F dbnsfp1000Gp1_AFR_AF \
-F dbnsfp1000Gp1_EUR_AF \
-F dbnsfp1000Gp1_AMR_AF \
-F dbnsfp1000Gp1_ASN_AF \
-F dbnsfpESP5400_AA_AF \
-F dbnsfpESP5400_EA_AF