Source code for mgkit.io.fastq

"""
Fastq utility functions
"""
import re
import numpy
import logging

from .utils import open_file

LOG = logging.getLogger(__name__)

CASAVA_HEADER_OLD = r"""(?P<machine>\w+-\w+):
        (?P<lane>\d):
        (?P<tile>\d+):
        (?P<xcoord>\d+):
        (?P<ycoord>\d+)\#
        (?P<index>(\d|[ACTGN]{6}))/
        (?P<mate>(1|2))"""
"Old casava header regex"

CASAVA_HEADER_NEW = r"""(?P<machine>[\w-]+):
        (?P<runid>\d+):
        (?P<cellid>\w+):
        (?P<lane>\d):
        (?P<tile>\d+):
        (?P<xcoord>\d+):
        (?P<ycoord>\d+)
        [_ ](?P<mate>\d): # underscore for data from from www.ebi.ac.uk/ena/
        (?P<filter>[YN]):
        (?P<bits>\d+):
        (?P<index>[ACTGN+]+)"""
"New casava header regex, including indices for both forward and reverse"

CASAVA_KHMER = r"""
    (?P<machine>[\w-]+):
    (?P<runid>\d+):
    (?P<cellid>\w+):
    (?P<lane>\d):
    (?P<tile>\d+):
    (?P<xcoord>\d+):
    (?P<ycoord>\d+)
    /(?P<mate>\d)
"""


[docs]def check_fastq_type(qualities):
    """
    Trys to guess the type of quality string used in a Fastq file

    :param str qualities: string with the quality scores as in the Fastq file
    :return str: a string with the guessed quality score

    .. note::

        Possible values are the following, classified but the values usually
        used in other softwares:

        * ASCII33: sanger, illumina-1.8
        * ASCII64: illumina-1.3, illumina-1.5, solexa-old

    """
    qualities = [ord(char) for char in qualities]

    max_qual = max(qualities)
    min_qual = min(qualities)

    if (min_qual >= 33) and (max_qual <= 73):
        return 'sanger'
    elif (min_qual >= 33) and (max_qual <= 74):
        return 'illumina-1.8'
    elif (min_qual >= 64) and (max_qual <= 104):
        return 'illumina-1.5'
    elif (min_qual >= 67) and (max_qual <= 104):
        return 'illumina-1.3'
    elif (min_qual >= 59) and (max_qual <= 104):
        return 'solexa-old'


[docs]def convert_seqid_to_new(seq_id):
    """
    Convert old seq_id format for Illumina reads to the new found in Casava
    1.8+

    :param str seq_id: seq_id of the sequence (stripped of '@')
    :return str: the new format seq_id

    .. note::

        Example from Wikipedia::

            old casava seq_id:
            @HWUSI-EAS100R:6:73:941:1973#0/1
            new casava seq_id:
            @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCAC

    """
    machine_id, cell_lane, tile_num, x_coord, other = seq_id.split(':')
    y_coord, other = other.split('#')
    idx_multiplex, mate = other.split('/')

    return "{0}:0:FC706VJ:{1}:{2}:{3}:{4} {5}:N:0:{6}".format(
        machine_id,
        cell_lane,
        tile_num,
        x_coord,
        y_coord,
        mate,
        idx_multiplex
    )


[docs]def convert_seqid_to_old(seq_id, index_as_seq=True):
    """

    .. deprecated:: 0.3.3

    Convert old seq_id format for Illumina reads to the new found in Casava
    until 1.8, which marks the new format.

    :param str seq_id: seq_id of the sequence (stripped of '@')
    :param bool index_as_seq: if True, the index for the multiplex we'll be
        the sequence found at the end of the new format seq_id. Otherwise, 0
        we'll be used
    :return str: the new format seq_id
    """
    left_part, right_part = seq_id.split(' ')
    machine_id, run_id, cell_id, cell_lane, tile_num, x_coord, \
        y_coord = left_part.split(':')

    mate, fail_filter, control_bits, index_seq = right_part.split(':')

    return "HWUSI-{0}:{1}:{2}:{3}:{4}#{5}/{6}".format(
        machine_id,
        cell_lane,
        tile_num,
        x_coord,
        y_coord,
        index_seq if index_as_seq else '0',
        mate
    )


[docs]def write_fastq_sequence(file_handle, name, seq, qual, write_mode='a'):
    """

    .. versionchanged:: 0.3.3
        if *qual* is not a string it's converted to chars (phred33)

    Write a fastq sequence to file. If the *file_handle* is a string, the file
    will be opened using *write_mode*.

    :param file_handle: file handle or string.
    :param str name: header to write for the sequence
    :param str seq: sequence to write
    :param str qual: quality string
    """
    file_handle = open_file(file_handle, write_mode)

    if isinstance(qual[0], (int, numpy.integer)):
        qual = ''.join(chr(q + 33) for q in qual)

    file_handle.write(
        "@{name}\n{seq}\n+\n{qual}\n".format(
            name=name,
            seq=seq,
            qual=qual
        ).encode('ascii')
    )


[docs]def choose_header_type(seq_id):
    """
    Return the guessed compiled regular expression
    :param str seq_id: sequence header to test

    :return: compiled regular expression object or None if no match found
    """
    if re.search(CASAVA_HEADER_OLD, seq_id, re.X) is not None:
        return re.compile(CASAVA_HEADER_OLD, re.X)
    if re.search(CASAVA_HEADER_NEW, seq_id, re.X) is not None:
        return re.compile(CASAVA_HEADER_NEW, re.X)
    if re.search(CASAVA_KHMER, seq_id, re.X) is not None:
        return re.compile(CASAVA_KHMER, re.X)
    return None


[docs]def load_fastq(file_handle, num_qual=False):
    """
    .. versionadded:: 0.3.1

    Loads a fastq file and returns a generator of tuples in which the first
    element is the name of the sequence, the second the sequence and the third
    the quality scores (converted in a numpy array if *num_qual* is True).

    .. note::

        this is a simple parser that assumes each sequence is on 4 lines,
        1st and 3rd for the headers, 2nd for the sequence and 4th the quality
        scores

    Arguments:
        file_handle (str, file): fastq file to open, can be a file name or a
            file handle
        num_qual (bool): if *False* (default), the quality score will be
            returned as ASCII character, if *True* a numpy array.

    Yields:
        tuple: first element is the sequence name/header, the second element is
        the sequence, the third is the quality score. The quality scores are
        kept as a string if *num_qual* is False (default) and converted to a
        numpy array with correct values (0-41) if *num_qual* is True

    Raises:
        ValueError: if the headers in both sequence and quality scores are not
        valid. This implies that the sequence/qualities have carriage returns
        or the file is truncated.

        TypeError: if the qualities are in a format different than sanger
        (min 0, max 40) or illumina-1.8 (0, 41)
    """

    file_handle = open_file(file_handle, 'r')

    if getattr(file_handle, 'name', None) is not None:
        LOG.info("Reading fastq file (%s)", file_handle.name)

    check_qual = True

    sequence_count = 0

    while True:
        header1 = file_handle.readline().decode('ascii').strip()
        # Reached the end of the file
        if not header1:
            break

        seq = file_handle.readline().decode('ascii').strip()

        header2 = file_handle.readline().decode('ascii').strip()
        qualities = file_handle.readline().decode('ascii').strip()

        if (header1[0] != '@') or (header2[0] != '+'):
            raise ValueError(
                "The sequence and quality headers are not valid '{}' != '{}'".format(header1, header2)
            )

        header1 = header1[1:]
        header2 = header2[1:]

        if check_qual:
            qual_type = check_fastq_type(qualities)
            if qual_type not in ('sanger', 'illumina-1.8'):
                raise TypeError(
                    'Quality scores are in "{}" format. You must convert them to Sanger (phred33)'.format(qual_type)
                )
            check_qual = False

        if num_qual:
            qualities = numpy.fromiter((ord(x) for x in qualities), numpy.int) - 33

        sequence_count += 1

        yield header1, seq, qualities

    LOG.info("Read %d fastq sequences", sequence_count)


[docs]def load_fastq_rename(file_handle, num_qual=False, name_func=None):
    """
    .. versionadded:: 0.3.3

    Mirrors the same functionality in :func:`mgkit.io.fasta.load_fasta_rename`.
    Renames the header of the sequences using *name_func*, which is called on
    each header. By default, the behaviour is to keep the header to the left of
    the first space (BLAST behaviour).
    """

    if name_func is None:
        def name_func(header): return header.split(' ')[0]

    for header, seq, qual in load_fastq(file_handle, num_qual=num_qual):
        yield name_func(header), seq, qual