Source code for mgkit.workflow.snp_parser

"""
This script parses results of SNPs analysis from any tool for SNP calling [#]_
and integrates them into a format that can be later used for other scripts in
the pipeline.

It integrates coverage and expected number of syn/nonsyn change and taxonomy
from a GFF file, SNP data from a VCF file.

.. note::

    The script accept gzipped VCF files

.. [#] GATK pipeline was tested, but it is possible to use samtools and
    bcftools

Changes
*******

.. versionchanged:: 0.2.1
    added *-s* option for VCF files generated using bcftools

.. versionchanged:: 0.1.16
    reworkked internals and removed SNPDat, syn/nonsyn evaluation is internal

.. versionchanged:: 0.1.13
    reworked the internals and the classes used, including options -m and -s


"""

from __future__ import division
from builtins import zip
import HTSeq
import logging
import argparse
from . import utils
from ..io import gff, compressed_handle, fasta
from .. import logger
from ..snps.classes import GeneSNP, SNPType
import pickle


LOG = logging.getLogger(__name__)


[docs]def set_parser(): """ Sets command line arguments parser """ parser = argparse.ArgumentParser( description='SNPs analysis, requires a vcf file and SNPDat results', formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( '-o', '--output-file', default='snp_data.pickle', type=argparse.FileType('wb'), help='Ouput file' ) parser.add_argument( '-q', '--min-qual', default=30, type=int, action='store', help='Minimum SNP quality (Phred score)' ) parser.add_argument( '-f', '--min-freq', default=0.01, type=float, action='store', help='Minimum allele frequency' ) parser.add_argument( '-r', '--min-reads', default=4, type=int, action='store', help='Minimum number of reads to accept the SNP' ) parser.add_argument( '-g', '--gff-file', required=True, type=argparse.FileType('rb'), action='store', help='GFF file with annotations' ) parser.add_argument( '-p', '--vcf-file', required=True, type=argparse.FileType('r'), action='store', help='Merged VCF file' ) parser.add_argument( '-a', '--reference', required=True, type=argparse.FileType('rb'), help='Fasta file with the GFF Reference' ) parser.add_argument( '-m', '--samples-id', action='append', required=True, type=str, help='the ids of the samples used in the analysis' ) parser.add_argument( '-c', '--cov-suff', action='store', default='_cov', help="Per sample coverage suffix in the GFF" ) parser.add_argument( '-s', '--bcftools-vcf', action='store_true', default=False, help="bcftools call was used to produce the VCF file" ) utils.add_basic_options(parser, manual=__doc__) return parser
[docs]def init_count_set(annotations): LOG.info("Init data structures") samples = list(annotations[0].sample_coverage.keys()) snp_data = dict( (sample, {}) for sample in samples ) for annotation in annotations: taxon_id = annotation.taxon_id uid = annotation.uid sample_coverage = annotation.sample_coverage for sample in sample_coverage: snp_data[sample][uid] = GeneSNP( uid=uid, gene_id=annotation.gene_id, taxon_id=taxon_id, exp_syn=annotation.exp_syn, exp_nonsyn=annotation.exp_nonsyn, coverage=sample_coverage[sample], ) return snp_data
[docs]def check_snp_in_set(samples, snp_data, pos, change, annotations, seq): """ Used by :func:`parse_vcf` to check if a SNP :param iterable samples: list of samples that contain the SNP :param dict snp_data: dictionary from :func:`init_count_set` with per sample SNPs information """ for annotation in annotations: if pos not in annotation: continue if annotation.is_syn(seq, pos, change, strict=False): snp_type = SNPType.syn else: snp_type = SNPType.nonsyn uid = annotation.uid rel_pos = annotation.get_relative_pos(pos) for sample in samples: snp_data[sample][uid].add_snp(rel_pos, change, snp_type=snp_type)
[docs]def parse_vcf(vcf_file, snp_data, min_reads, min_af, min_qual, annotations, seqs, options, line_num=100000): """ Parse VCF file counts synonymous and non-synonymous SNPs :param file vcf_file: file handle to a VCF file :param dict snp_data: dictionary from :func:`init_count_set` with per sample SNPs information :param int min_reads: minimum number of reads to accept a SNP :param float min_af: minimum allele frequency to accept a SNP :param int min_qual: minimum quality (Phred score) to accept a SNP :param dict annotations: annotations grouped by their reference sequence :param dict seqs: reference sequences :param int line_num: the interval in number of lines at which progress will be printed """ vcf_handle = HTSeq.VCF_Reader(compressed_handle(vcf_file)) vcf_handle.parse_meta() vcf_handle.make_info_dict() # total number of SNPs accepted count_tot = 0 # number of SNPs skipped for low depth skip_dp = 0 # number of SNPs skipped for low allele frequency skip_af = 0 # number of SNPs skipped for low quality skip_qual = 0 # indels skip_indels = 0 for vcf_record in vcf_handle: # the SNP is a sequence with no annotations if vcf_record.chrom not in annotations: continue if float(vcf_record.qual) < min_qual: # low quality SNP skip_qual += 1 continue # unpack info records (needed for vcf_record.info to be a dictionary) vcf_record.unpack_info(vcf_handle.infodict) if vcf_record.info['INDEL']: skip_indels += 1 continue if not isinstance(vcf_record.info['DP'], int): LOG.warning(vcf_record.info['DP']) if vcf_record.info['DP'] < min_reads: # not enough reads (depth) for the SNP skip_dp += 1 continue # Samtools mpileup -> bcftools call doesn't output the allele freq. # it can be calculated with AC/AN for each ALT nucleotide # checked on bfctools (roh command) manual # https://samtools.github.io/bcftools/bcftools.html try: allele_freqs = vcf_record.info['AF'] except KeyError: if isinstance(vcf_record.info['AC'], list): allele_freqs = [ AC / vcf_record.info['AN'] for AC in vcf_record.info['AC'] ] else: allele_freqs = vcf_record.info['AC'] / vcf_record.info['AN'] # if the allele frequency is a single value, make it a list, so # the iteration below works anyway if isinstance(allele_freqs, float): allele_freqs = [allele_freqs] # alt is the nucleotidic change iter_data = zip(allele_freqs, vcf_record.alt) for alt_index, (allele_freq, change) in enumerate(iter_data): if allele_freq < min_af: # the allele frequency for the SNP is too low, it'll be # skipped skip_af += 1 continue # the samples that contain the SNP is a string separated by '-' if options.bcftools_vcf: samples = set() for sample_id, sample_info in vcf_record.samples.items(): # prepare the genotype list, to make the comparison easier # the genotype separator to '/' only, to use only one # type of split sample_info_gt = sample_info['GT'].replace('|', '/') sample_info_gt = sample_info_gt.split('/') for genotype in sample_info_gt: if genotype == '.': continue if int(genotype) == (alt_index + 1): samples.add(sample_id) else: samples = [ sample for sample in vcf_record.info['set'].split('-') ] check_snp_in_set( samples, snp_data, vcf_record.pos.start, change, annotations[vcf_record.chrom], seqs[vcf_record.chrom] ) # increase the total number of snps available count_tot += 1 if vcf_handle.line_no % line_num == 0: LOG.info( "Line %d, SNPs passed %d; skipped for: qual %d, " + "depth %d, freq %d, indels %d", vcf_handle.line_no, count_tot, skip_qual, skip_dp, skip_af, skip_indels )
[docs]def save_data(output_file, snp_data): """ Pickle data structures to the disk. :param str output_file: base name for pickle files :param dict snp_data: dictionary from :func:`init_count_set` with per sample SNPs information """ LOG.info("Saving sample SNPs to %s", output_file) pickle.dump(snp_data, output_file, -1)
[docs]def main(): "Main function" options = set_parser().parse_args() # configs log and set log level logger.config_log(options.verbose) seqs = dict(fasta.load_fasta(options.reference)) # Loads them as list because it's easier to init the data structure annotations = list(gff.parse_gff(options.gff_file)) if len(annotations[0].sample_coverage) != len(options.samples_id): utils.exit_script( "Coverage information was not found for all samples", 2 ) snp_data = init_count_set(annotations) # Group annotations by their reference sequence annotations = gff.group_annotations( annotations, key_func=lambda x: x.seq_id ) parse_vcf( options.vcf_file, snp_data, options.min_reads, options.min_freq, options.min_qual, annotations, seqs, options ) save_data(options.output_file, snp_data)
if __name__ == '__main__': main()