# The overall goal of this script is to analyse the real dataset # This script performs an advanced analysis dataDir=/Users/timothyh/home/courses/course_vc_2012_nb2_autumn_incorporateImprovements read1FqFile=${dataDir}/inputData/reads_agilentV1_chr5/real/real_agilentV1_chr5.R1.fastq read2FqFile=${dataDir}/inputData/reads_agilentV1_chr5/real/real_agilentV1_chr5.R2.fastq refFile=${dataDir}/inputData/human_g1k_v37_chr5/gatkBundle/human_g1k_v37_chr5.fasta knownVariants=${dataDir}/inputData/human_g1k_v37_chr5/gatkBundle/dbsnp_132.b37_chr5.vcf tiles=${dataDir}/inputData/human_g1k_v37_chr5/agilentV1/agilent37M.chr5.b37.bed ## Mapping using BWA bwa aln -t 3 $refFile $read1FqFile > aln_sa1.sai bwa aln -t 3 $refFile $read2FqFile > aln_sa2.sai bwa sampe $refFile aln_sa1.sai aln_sa2.sai $read1FqFile $read2FqFile > aln.sam ## Making sure all mate information is correct picard FixMateInformation.jar SORT_ORDER=coordinate INPUT=aln.sam OUTPUT=aln.posiSrt.bam CREATE_INDEX=true ## Adding meta data information picard AddOrReplaceReadGroups.jar \ CREATE_INDEX=true \ INPUT=aln.posiSrt.bam \ OUTPUT=aln.posiSrt.withRG.bam \ RGLB="agilentCapture" \ RGPL="Illumina" \ RGPU="unknown" \ RGSM="anonymousIndividual" \ RGCN="NSC" \ RGDS="Some_description_of_the_read_group" ## Checking that we have a valid BAM file picard ValidateSamFile.jar \ VALIDATION_STRINGENCY=STRICT \ MODE=VERBOSE \ IGNORE_WARNINGS=false \ REFERENCE_SEQUENCE=${refFile} \ VALIDATE_INDEX=true \ INPUT=aln.posiSrt.withRG.bam ## Identifying where re-alignment is required and then performing re-alignment gatk -T RealignerTargetCreator \ -R $refFile \ -o realignment.intervals \ -I aln.posiSrt.withRG.bam \ --known $knownVariants gatk -T IndelRealigner \ -I aln.posiSrt.withRG.bam \ -R $refFile \ -targetIntervals realignment.intervals \ -o aln.posiSrt.withRG.clean.bam \ -compress 0 \ --disable_bam_indexing ## Marking duplicates picard MarkDuplicates.jar \ INPUT=aln.posiSrt.withRG.clean.bam \ OUTPUT=aln.posiSrt.withRG.clean.dedup.bam \ METRICS_FILE=aln.posiSrt.withRG.clean.dedup.bam_metrics.duplication.txt \ CREATE_INDEX=true \ REMOVE_DUPLICATES=false \ COMMENT="MarkDuplicates.jar_run_on_cleaned_alignments" ../beautifyDuplicationMetricsReport.bash aln.posiSrt.withRG.clean.dedup.bam_metrics.duplication.txt > aln.posiSrt.withRG.clean.dedup.bam_metrics.duplication.beautified.txt ## Base quality score recalibration gatk CountCovariates \ -l INFO \ -R $refFile \ -knownSites $knownVariants \ -I aln.posiSrt.withRG.clean.dedup.bam \ -cov ReadGroupCovariate \ -cov QualityScoreCovariate \ -cov DinucCovariate \ -cov CycleCovariate \ -recalFile table.recal_data.pre.csv gatk TableRecalibration \ -l INFO \ -R $refFile \ -I aln.posiSrt.withRG.clean.dedup.bam \ -recalFile table.recal_data.pre.csv \ --out aln.posiSrt.withRG.clean.dedup.recal.bam \ -OQ \ -pQ 5 \ --disable_bam_indexing samtools index aln.posiSrt.withRG.clean.dedup.recal.bam ## validation of BAM file picard ValidateSamFile.jar \ VALIDATION_STRINGENCY=STRICT \ MODE=VERBOSE \ IGNORE_WARNINGS=false \ REFERENCE_SEQUENCE=${refFile} \ VALIDATE_INDEX=true \ INPUT=aln.posiSrt.withRG.clean.dedup.recal.bam # define a variable to hold the name of this BAM file bamFile=aln.posiSrt.withRG.clean.dedup.recal.bam ## alignment metrics picard CollectAlignmentSummaryMetrics.jar \ INPUT=${bamFile} \ OUTPUT=${bamFile}_metrics.alignment.txt \ REFERENCE_SEQUENCE=$refFile ../beautifyAlignmentMetricsReport.bash ${bamFile}_metrics.alignment.txt > ${bamFile}_metrics.alignment.beautified.txt ## Insert size metrics picard CollectInsertSizeMetrics.jar \ HISTOGRAM_FILE=${bamFile}_metrics.insert.pdf \ INPUT=${bamFile} \ OUTPUT=${bamFile}_metrics.insert.txt ../beautifyInsertMetricsReport.bash ${bamFile}_metrics.insert.txt > ${bamFile}_metrics.insert.beautified.txt ## Coverage metrics picard CalculateHsMetrics.jar \ BAIT_INTERVALS=${dataDir}/inputData/human_g1k_v37_chr5/agilentV1/agilent37M.chr5.b37.txt \ TARGET_INTERVALS=${dataDir}/inputData/human_g1k_v37_chr5/agilentV1/agilent37M.chr5.b37.txt \ INPUT=${bamFile} OUTPUT=${bamFile}_metrics.coverage.txt \ REFERENCE_SEQUENCE=${refFile} \ PER_TARGET_COVERAGE=${bamFile}_metrics.coveragePerTarget.txt ../beautifyCoverageMetricsReport.bash ${bamFile}_metrics.coverage.txt > ${bamFile}_metrics.coverage.beautified.txt function compressAndIndex(){ igvtools index $1 bgzip -c $1 > $1.gz tabix -p vcf $1.gz } ## Variant calling snps=snpsHQ.vcf indels=indelsHQ.vcf gatk UnifiedGenotyper \ --validation_strictness STRICT \ -R $refFile \ -I $bamFile \ -o $snps \ --dbsnp $knownVariants gatk UnifiedGenotyper \ --validation_strictness STRICT \ -R $refFile \ -glm INDEL \ -I $bamFile \ -o $indels \ --dbsnp $knownVariants compressAndIndex $snps compressAndIndex $indels ## Variant filtration snpsFiltered=snpsHQ.filt.vcf indelsFiltered=indelsHQ.filt.vcf gatk VariantFiltration \ -R $refFile \ --variant $snps \ -o $snpsFiltered \ --filterExpression "QD<2.0" \ --filterExpression "MQ<40.0" \ --filterExpression "FS>60.0" \ --filterName "QDFilter" \ --filterName "MQFilter" \ --filterName "FSFilter" gatk VariantFiltration \ -R $refFile \ --variant $indels \ -o $indelsFiltered \ --filterExpression "QD<2.0" \ --filterExpression "FS>200.0" \ --filterName "QDFilter" \ --filterName "FSFilter" compressAndIndex $snpsFiltered compressAndIndex $indelsFiltered # Functional annotation vcfInput="snpsHQ.filt.vcf" # vcf file to be annotated by snpEff outputDir=. #### Variables that may be set by user but probably should not be # In particular, snpEffDb should not be changed as other versions are know to give erroneous results (see background) snpEffDb=GRCh37.64 # define the version of the Ensembl database to be used by snpEff, location of file is specified in snpEff's config file dbNSFP=/Users/timothyh/home/binHTS/snpSift_1.7/dbNSFP2.0b3.txt # download from snpSift website ##### Output files >> should not require any changes to code beyond this point function filenameWoExt(){ basename $1 | sed 's/\(.*\)\.[a-z]*/\1/' } vcfAllImpacts=`filenameWoExt $vcfInput`.$snpEffDb.vcf # output by snpEff vcfTopImpact=`filenameWoExt $vcfAllImpacts`.topImpact.vcf # output by gatk vcfNSFP=`filenameWoExt $vcfTopImpact`.NSFP.vcf # output by snpSift # Annotate all effects given ensembl db using snpEff cd $outputDir snpEff2.0.5 eff -v -onlyCoding true -i vcf -o vcf $snpEffDb $vcfInput > $vcfAllImpacts # Here we pick out the top impact annotation for each variant # There are alternatives here e.g. picking out the transcript that is in CCDS or in REFSEQ gatk2.1 VariantAnnotator \ -R $refFile \ -A SnpEff \ --variant $vcfInput \ --snpEffFile $vcfAllImpacts \ -L $vcfInput \ -o $vcfTopImpact # Add detailed information for non-synomymous snps from the dbNSFP db using snpSift snpSift1.7 dbnsfp -v $dbNSFP $vcfTopImpact > $vcfNSFP # Transform into a table for easier filtering gatk2.1 VariantsToTable \ -R $refFile \ -V $vcfNSFP \ -o ${vcfNSFP}_asTable \ --allowMissingData \ --showFiltered \ -F CHROM \ -F POS \ -F ID \ -F REF \ -F ALT \ -F QUAL \ -F FILTER \ -GF GT \ -GF GQ \ -F HET \ -F HOM-REF \ -F HOM-VAR \ -F TYPE \ -F VAR \ -F SNPEFF_EFFECT \ -F SNPEFF_IMPACT \ -F SNPEFF_FUNCTIONAL_CLASS \ -F SNPEFF_CODON_CHANGE \ -F SNPEFF_AMINO_ACID_CHANGE \ -F SNPEFF_GENE_NAME \ -F SNPEFF_GENE_BIOTYPE \ -F SNPEFF_TRANSCRIPT_ID \ -F SNPEFF_EXON_ID \ -F dbnsfpUniprot_acc \ -F dbnsfpInterpro_domain \ -F dbnsfpEnsembl_transcriptid \ -F dbnsfpSIFT_score \ -F dbnsfpPolyphen2_HVAR_pred \ -F dbnsfpGERP++_NR \ -F dbnsfpGERP++_RS \ -F dbnsfp29way_logOdds \ -F dbnsfp1000Gp1_AF \ -F dbnsfp1000Gp1_AFR_AF \ -F dbnsfp1000Gp1_EUR_AF \ -F dbnsfp1000Gp1_AMR_AF \ -F dbnsfp1000Gp1_ASN_AF \ -F dbnsfpESP5400_AA_AF \ -F dbnsfpESP5400_EA_AF