Source code for msprime.mutations

#
# Copyright (C) 2018-2021 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime.  If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for generating mutations on a given tree sequence.
"""
import inspect
import math
import numbers
import sys

import numpy as np
import tskit

from . import core
from . import intervals
from . import provenance
from msprime import _msprime

_ACGT_ALLELES = ["A", "C", "G", "T"]
_AMINO_ACIDS = [
    "A",
    "R",
    "N",
    "D",
    "C",
    "Q",
    "E",
    "G",
    "H",
    "I",
    "L",
    "K",
    "M",
    "F",
    "P",
    "S",
    "T",
    "W",
    "Y",
    "V",
]



[docs]
class MutationModel:
    """
    Abstract superclass of msprime mutation models.
    """

    def asdict(self):
        # This version of asdict makes sure that we have sufficient parameters
        # to call the constructor and recreate the class. However, this means
        # that subclasses *must* have an instance variable of the same name as
        # each parameter.
        # This is essential for Provenance round-tripping to work.
        return {
            key: getattr(self, key)
            for key in inspect.signature(self.__init__).parameters.keys()
            if hasattr(self, key)
        }




[docs]
class MatrixMutationModel(_msprime.MatrixMutationModel, MutationModel):
    """
    Superclass of the specific mutation models with a finite set of states.
    This class can be used to create a matrix mutation model with custom
    alleles and probabilities. For details of how this works see
    :ref:`sec_mutations_matrix_mutation_models_details`.

    :param list[str] alleles: A list of the possible alleles generated by the model. Each
        entry is a string.
    :param list[float] root_distribution: An array of float values the same length
        as ``alleles`` which should sum to 1. These values are used to determine the
        ancestral state of each mutated site. For example if ``alleles`` is
        ``['A','B']`` and ``root_distribution`` is ``[1,0]`` all ancestral states will be
        ``'A'``.
    :param list[float] transition_matrix: A square 2d-matrix of transition
        probabilities where both dimensions are equal to the length of the ``alleles``
        argument. These are used to determine the derived state at each mutation. Note
        that rows should sum to 1.
    """

    def __str__(self):
        s = f"Mutation model with alleles {self.alleles}\n"
        s += "  root distribution: {}\n".format(
            " ".join(map(str, self.root_distribution))
        )
        s += "  transition matrix:\n"
        for row in self.transition_matrix:
            s += "     {}\n".format(" ".join(map(str, row)))
        return s




[docs]
class SLiMMutationModel(_msprime.SLiMMutationModel, MutationModel):
    """
    An infinite-alleles model of mutation producing "SLiM-style" mutations.

    To agree with mutations produced by SLiM, the ancestral state of each new
    site is set to the empty string, and each derived state is produced by
    appending the "next allele" to the previous state. The result is that each
    allele is a comma-separated string of all mutations that have occurred up
    to the root.

    Mutations produced by SLiM carry both a ``time`` attribute, in units of "time ago"
    as usual for tskit, as well as a metadata attributed called "origin_generation",
    in units of time since the start of the simulation. Adding these two together -
    time since the start of the simulation plus time until the end - is equal to the
    total number of generations of the simulation. The origin_generation is not
    currently used by SLiM, but for consistency, the origin_generation attribute for
    mutations produced by this model is set equal to ``slim_generation`` minus
    ``floor(mut.time)``, where ``mut.time`` is the (tskit) time ago of the mutation.

    :param int type: The nonnegative integer defining the "type" of SLiM mutation
        that will be recorded in metadata.
    :param int next_id: The nonnegative integer to start assigning alleles from.
        (default: 0)
    :param int slim_generation: The "SLiM generation" time used in determining the
        "origin_generation" metadata attribute of mutations. This can usually
        be left at its default, which is 1.
    :param int block_size: The block size for allocating derived states.
        You do not need to change this unless you get an "out of memory" error
        due to a very large number of stacked mutations.
    """

    def __str__(self):
        return (
            f"Mutation model for SLiM mutations of type m{self.type}\n"
            f"  next ID: {self.next_id}\n"
        )



# NOTE: we use a hacky workaround here for documenting the next_allele instance
# variable because of the nasty sphinx bug:
# https://github.com/sphinx-doc/sphinx/issues/2549
# This seems to be basically impossible to workaround in JupyterBook, so it
# seems best to just avoid using ivars. The correct approach going forward
# would appear to be to use dataclasses and to document each variable
# independently (e.g., Population class).

[docs]
class InfiniteAlleles(_msprime.InfiniteAllelesMutationModel, MutationModel):
    """
    An *infinite alleles* model of mutation in which each allele is a
    unique positive integer. This works by keeping track of a "next
    allele": each time the model is asked to produce a new allele (either for an
    ancestral or derived state), the "next allele" is provided, and then
    incremented.

    **Variables:**

    **next_allele**: The *next* allele to be assigned. This increments by
    one each time an allele is assigned to a new mutation, and resets to
    ``start_allele`` each time a new ancestral state is assigned.

    :param int start_allele: The nonnegative integer to start assigning alleles from.
       (default: 0)
    """

    def __str__(self):
        return (
            f"Infinite alleles mutation model, beginning with allele"
            f" {self.start_allele}\n    next allele: {self.next_allele}\n"
        )




[docs]
class BinaryMutationModel(MatrixMutationModel):
    """
    Basic binary mutation model with two alleles: ``"0"`` and ``"1"``, and ancestral
    allele always ``"0"``.

    This is a :class:`.MatrixMutationModel` with alleles ``["0", "1"]``,
    root distribution ``[1.0, 0.0]``, and transition matrix that is either
    ``[[0.0, 1.0], [1.0, 0.0]]`` (by default), or
    ``[[0.5, 0.5], [0.5, 0.5]]`` (if ``state_independent=True``).

    :param bool state_independent: Whether mutations will be generated in a way
        independent of the previous allelic state, which includes silent mutations.
        See :ref:`sec_mutations_existing` for more details.
    """

    def __init__(self, state_independent=False):
        alleles = ["0", "1"]
        root_distribution = [1, 0]
        if state_independent:
            transition_matrix = [[0.5, 0.5], [0.5, 0.5]]
        else:
            transition_matrix = [[0.0, 1.0], [1.0, 0.0]]
        super().__init__(alleles, root_distribution, transition_matrix)



def general_microsat_rate_matrix(s, u, v, p, m, lo, hi):
    """
    Construct the general microsat rate matrix associated
    with the Sainudiin et al. (2004) parameterization.
    While the original paper implements a continuous time markov chain,
    here we implement the jump process rate matrix equivalent for consistency
    with the :class:`.MatrixMutationModel`.

    In particular if :math:`q_{ij}` is an element of the originally defined
    transition matrix, we define :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
    Then, as a :class:`.MatrixMutationModel` this model has the transition
    matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`.

    :param float s: strength of length dependence on mutation rate
     (i.e. s=0 uniform rate)
    :param float u: constant bias parameter; U ~ (0, 1)
    :param float v: linear bias parameter; v ~ (-inf, inf)
    :param float p: probability of a single step mutation
    :param float m: success prob of truncated gamma distribution
    :param int lo: lower bound on repeat number
    :param int hi: upper bound on repeat number
    :return: rate matrix
    """

    # Helper functions for the Sainudiin et al. rate matrix
    def alpha_microsat(i, u, v, lo):
        return max(0, min(1, u - (v * (i - lo))))

    def beta_microsat(i, s, lo):
        """
        beta helper function from Sainudiin et al. (2004)
        represents the mutation rate from allele i
        :param float s: strength of length dependence on mutation rate
        (i.e. s=0 uniform rate)
        :param int lo: lower bound
        :param int hi: upper bound
        """
        return 1 + ((i - lo) * s)

    def gamma_microsat(m, i, j, lo, hi):
        """
        gamma helper function from Sainudiin et al. (2004)
        :param float m: prob success
        :param int i: in-state
        :param int j: out-state
        :param int lo: lower bound of microsat repeat length distribution
        :param int hi: upper bound of microsat repeat length distribution
        """
        num = m * (1 - m) ** (np.abs(i - j) - 1)
        if lo <= i < j <= hi:
            denom = 1 - (1 - m) ** (hi - i)
        elif lo <= j < i <= hi:
            denom = 1 - (1 - m) ** (i - lo)
        else:
            raise TypeError("microsat model error. i and j must be between lo and hi")

        return num / denom

    nalleles = hi - lo + 1
    rate_matrix = np.zeros((nalleles, nalleles))
    for i in range(lo, hi + 1):
        for j in range(lo, hi + 1):
            # shift coords to account for 0-indexing in rate matrix
            ii = i - lo
            jj = j - lo
            if i == j - 1:
                rate_matrix[ii, jj] = (
                    beta_microsat(i, s, lo)
                    * alpha_microsat(i, u, v, lo)
                    * (p + ((1 - p) * gamma_microsat(m, i, j, lo, hi)))
                )
            elif i < j - 1:
                rate_matrix[ii, jj] = (
                    beta_microsat(i, s, lo)
                    * alpha_microsat(i, u, v, lo)
                    * (1 - p)
                    * gamma_microsat(m, i, j, lo, hi)
                )
            elif i == j + 1:
                rate_matrix[ii, jj] = (
                    beta_microsat(i, s, lo)
                    * (1 - alpha_microsat(i, u, v, lo))
                    * (p + ((1 - p) * gamma_microsat(m, i, j, lo, hi)))
                )
            elif i > j + 1:
                rate_matrix[ii, jj] = (
                    beta_microsat(i, s, lo)
                    * (1 - alpha_microsat(i, u, v, lo))
                    * (1 - p)
                    * gamma_microsat(m, i, j, lo, hi)
                )
            else:
                rate_matrix[ii, jj] = 0
    # scale by max row sum; then normalize to 1
    row_sums = rate_matrix.sum(axis=1, dtype="float64")
    alpha = np.max(row_sums)
    rate_matrix /= alpha
    row_sums = rate_matrix.sum(axis=1, dtype="float64")
    # the max(0, *) is to avoid floating point error
    np.fill_diagonal(rate_matrix, np.fmax(0.0, 1.0 - row_sums))
    return rate_matrix



[docs]
class MicrosatMutationModel(MatrixMutationModel):
    """
    General class for Microsatellite mutation models
    which parameterizes the transition matrix describing
    changes among alleles using five parameters according
    to the model of
    `Sainudiin et al. (2004) <https://doi.org/10.1534/genetics.103.022665>`_

    .. seealso:: See the :ref:`sec_mutations_microsats` section for more details
       and examples.

    Concretely, mutation rates under this model are computed as follows.
    Let ``qij`` be the transition rate from equation (3) in Sainudiin et al,
    and let ``M`` be the largest row sum of the matrix whose i,j-th entry
    is ``qij`` and whose diagonals are zero. Then if the mutation rate
    passed to :func:`.sim_mutations` is ``m``, the mutation rate from
    allele ``i`` to allele ``j`` is ``m * qij / M``.

    Note that the values of ``lo`` and ``hi`` are in units of repeat number,
    and are unrelated to repeat length
    (e.g., the repeat itself can be dinucleotide, tri-nucleotide, etcetera).

    .. warning::
        The default values for ``lo`` and ``hi`` are chosen arbitrarily.
        The default of ``lo=1`` is chosen so that all states might be
        represented, but please note that in empirical data,
        microsatellite loci are often ascertained based on
        the number of repeats observed.
        Please ensure that you set ``lo`` and ``hi``
        to the appropriate values for your organism / locus of interest.

    :param float s: strength of length dependence on mutation rate.
        Defaults to 0, i.e., no length dependence.
    :param float u: Constant bias parameter; must be between 0 and 1.
        Defaults to 0.5, i.e., no bias.
    :param float v: Linear bias parameter. Defaults to 0, i.e., no bias.
    :param float p: Probability of a single step mutation. Defaults to 1.
    :param float m: 1 / (mean multistep mutation size). Defaults to 1.
    :param int lo: Lower bound on repeat number. Defaults to 2.
    :param int hi: Upper bound on repeat number (inclusive). Defaults to 50.
    :param list[float] root_distribution: An array of float values of length hi-lo+1
        which should sum to 1, and give the probabilities used to choose
        the ancestral state of each microsatellite.
        Defaults to the stationary distribution of the model.
    """

    def __init__(
        self,
        *,
        s=None,
        u=None,
        v=None,
        p=None,
        m=None,
        lo=None,
        hi=None,
        root_distribution=None,
    ):
        s = 0.0 if s is None else s
        u = 0.5 if u is None else u
        v = 0.0 if v is None else v
        p = 1.0 if p is None else p
        m = 1.0 if m is None else m
        lo = 1 if lo is None else lo
        hi = 50 if hi is None else hi

        params = [s, u, v, p, m, lo, hi]
        if any(not math.isfinite(x) for x in params):
            raise ValueError("All parameters must be finite")
        if any(not isinstance(x, numbers.Number) for x in params):
            raise ValueError("All parameters must be numeric")

        lo = int(lo)
        hi = int(hi)
        if lo > hi:
            raise ValueError("lo > hi")
        if lo < 0:
            raise ValueError("lo must be >= 0")
        if s <= -1 / (hi - lo + 1):
            raise ValueError("s must be > -1/(hi-lo+1)")
        if u <= 0 or u >= 1:
            raise ValueError("u must be between 0 and 1")
        alleles = [str(int(x)) for x in range(lo, hi + 1)]
        transition_matrix = general_microsat_rate_matrix(s, u, v, p, m, lo, hi)
        if root_distribution is None:
            # solve for stationary distribution
            S, U = np.linalg.eig(transition_matrix.T)
            U = np.real_if_close(U, tol=1)
            stationary = np.array(U[:, np.where(np.abs(S - 1.0) < 1e-8)[0][0]])
            stationary = stationary / np.sum(stationary)
            root_distribution = stationary
        super().__init__(alleles, root_distribution, transition_matrix)




[docs]
class SMM(MicrosatMutationModel):
    """
    Stepwise mutation model (Ohta and Kimura, 1978),
    for microsatellite repeat number.

    This is a :class:`.MicrosatMutationModel` with alleles ``[lo, .. , hi]``,
    a root distribution (stationary distribution by default),
    and a transition matrix that allows only mutations that
    change the number of repeats by +/- 1. Concretely, if the mutation
    rate is ``m``, then an allele ``k`` mutates to ``k+1`` and ``k-1``
    at rate ``m/2`` each,
    except that mutations above ``hi`` or below ``lo`` result in no change.

    :param int lo: Repeat number lower bound. See the documentation for
        :class:`.MicrosatMutationModel` for details.
    :param int hi: Repeat number upper bound. See the documentation for
        :class:`.MicrosatMutationModel` for details.
    :param list[float] root_distribution: See the documentation for
        :class:`.MicrosatMutationModel` for details.
    """

    def __init__(self, lo=None, hi=None, *, root_distribution=None):
        super().__init__(
            s=0, u=0.5, v=0, p=0, m=1, lo=lo, hi=hi, root_distribution=root_distribution
        )




[docs]
class TPM(MicrosatMutationModel):
    """
    Two-phase mutation model of
    `DiRienzo et al. (1994) <https://doi.org/10.1073/pnas.91.8.3166>`_

    This models evolution of microsatellite repeat number in a manner
    that allows for multi-step mutations in copy number of microsatellite repeats.
    This is parameterized by `p` the probability of a single step mutation,
    and `m` the success probability of the truncated gamma distribution
    describing the distribution of longer steps such that the
    mean multi-step mutation is length `1/m`.
    For this model both `p` and `m` need to be set to < 1.0.

    This is a :class:`.MicrosatMutationModel` with alleles ``[lo, .. , hi]``.
    For a precise definition of the mutation rates, see :class:`.MicrosatMutationModel`
    with ``s=0``, ``u=0.5``, and ``v=0``.
    See :class:`.MicrosatMutationModel` for further details of parameterization.

    :param float p: Probability of a single step mutation.
    :param float m: Success prob of truncated gamma distribution.
    :param int lo: Repeat number lower bound. See the documentation for
        :class:`.MicrosatMutationModel` for details.
    :param int hi: Repeat number upper bound. See the documentation for
        :class:`.MicrosatMutationModel` for details.
    :param list[float] root_distribution: See the documentation for
        :class:`.MicrosatMutationModel` for details.
    """

    def __init__(self, *, p, m, lo=None, hi=None, root_distribution=None):
        if not (0 < p < 1.0):
            raise ValueError("p must be between 0 and 1")
        if not (0 < m < 1.0):
            raise ValueError("m must be between 0 and 1")
        super().__init__(
            s=0, u=0.5, v=0, p=p, m=m, lo=lo, hi=hi, root_distribution=root_distribution
        )




[docs]
class EL2(MicrosatMutationModel):
    """
    Equal rate, linear biased, two-phase mutation model of
    `Garza et al. (1995) <10.1093/oxfordjournals.molbev.a040239>`_
    as parameterized by
    `Sainudiin et al. (2004) <https://doi.org/10.1534/genetics.103.022665>`_

    This models evolution of microsatellite repeat number in a manner
    that allows for multi-step mutations in copy number.
    This is parameterized by `m` the success probability of
    the truncated gamma distribution
    describing the distribution of longer steps, such that the
    mean multi-step mutation is length `1/m`. `u` the constant
    bias parameter which determines bias (if any) in expansion or
    contraction of the repeat number, and `v` the linear bias
    parameter which determines mutation rate variation associated
    with repeat number.
    For this model `m` need to be set to < 1.0,
    and `u` needs to be set to between 0 and 1.
    and `v` can take any real, positive or negative value.

    See :class:`.MicrosatMutationModel` for details on the  constant
    bias parameter `u` and a linear bias parameter `v`.

    This is a :class:`.MicrosatMutationModel` with alleles ``[lo, .. , hi]``
    For a precise definition of the mutation rates,
    see :class:`.MicrosatMutationModel`
    with ``s=0``.

    :param float m: Success prob of truncated gamma distribution.
    :param float u: Constant bias parameter.
    :param float v: Linear bias parameter.
    :param int lo: Repeat number lower bound. See the documentation for
        :class:`.MicrosatMutationModel` for details.
    :param int hi: Repeat number upper bound. See the documentation for
        :class:`.MicrosatMutationModel` for details.
    :param list[float] root_distribution: See the documentation for
        :class:`.MicrosatMutationModel` for details.
    """

    def __init__(self, *, m, u, v, lo=None, hi=None, root_distribution=None):
        if not (0 < m < 1.0):
            raise ValueError("m must be between 0 and 1")
        if not (0 < u < 1.0):
            raise ValueError("u must be between 0 and 1")
        super().__init__(
            s=0, u=u, v=v, p=0, m=m, lo=lo, hi=hi, root_distribution=root_distribution
        )




[docs]
class JC69(MatrixMutationModel):
    """
    Jukes-Cantor mutation model (Jukes and Cantor 1969). Based on the standard ACGT
    nucleotides as allelic states, this model assumes equal probabilities for
    ancestral state and equal probabilities for all possible transitions.

    This is a :class:`.MatrixMutationModel` with alleles ``["A", "C", "G", "T"]``,
    root distribution ``[1/4, 1/4, 1/4, 1/4]``, and default transition matrix
    ``[[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3],
    [1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]]``.
    It has no free parameters.

    If ``state_independent`` is True, then instead all entries of the
    transition matrix will be 1/4, so one-quarter of the mutations will be
    silent. See :ref:`sec_mutations_existing` for more details.

    Citation: *Jukes TH, Cantor CR (1969). Evolution of Protein Molecules. pp. 21-132.*

    :param bool state_independent: Whether mutations will be generated in a way
        independent of the previous allelic state.
    """

    def __init__(self, state_independent=False):
        alleles = _ACGT_ALLELES
        root_distribution = [0.25, 0.25, 0.25, 0.25]
        transition_matrix = np.zeros((4, 4))
        if state_independent:
            transition_matrix[:] = 1 / 4
        else:
            transition_matrix[:] = 1 / 3
            np.fill_diagonal(transition_matrix, 0)
        super().__init__(alleles, root_distribution, transition_matrix)




[docs]
class HKY(MatrixMutationModel):
    """
    The Hasegawa, Kishino and Yano mutation model (Hasegawa et al. 1985). Based on the
    standard ACGT nucleotides as allelic states, this model allows different rates for
    transitions and transversions, and sets an equilibrium frequency for each nucleotide.
    In addition a custom ancestral frequency (``root_distribution``) can be specified.
    With ``kappa=1.0`` and the default values of the other arguments this model is equal
    to :class:`.JC69`. This model is similar to :class:`.F84`
    but with a differing parameterisation for ``kappa``.

    This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of
    transition to transversion mutation rates, and :math:`\\pi`
    (``equilibrium_frequencies``), the vector of equilibrium nucleotide
    frequencies. If this mutation model is used with a ``mutation_rate`` of
    :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is
    equal to :math:`Q_{ij}`, where

    :math:`\\mathbf{Q} =
    \\frac{\\mu}{M} \\times
    \\begin{bmatrix}
    \\cdot        &  \\pi_C         & \\kappa \\pi_G & \\pi_T           \\\\
    \\pi_A        &  \\cdot         & \\pi_G        & \\kappa \\pi_T    \\\\
    \\kappa \\pi_A &  \\pi_C         & \\cdot        & \\pi_T           \\\\
    \\pi_A        &  \\kappa \\pi_C  & \\pi_G        & \\cdot
    \\end{bmatrix}`

    Here :math:`M` is an overall scaling factor on the mutation rate that can
    be computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion,
    and :math:`q_{ij} = \\kappa \\pi_j` otherwise. Set :math:`q_{ii} = 0`,
    and then let :math:`M` be the largest row sum of :math:`q`, i.e.,
    :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
    Then, this implementation as a :class:`.MatrixMutationModel` has the transition
    matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`.

    Note also that :math:`\\kappa` is the ratio of *individual* mutation rates,
    not the ratio of *total* transition to transversion rates, which would
    be :math:`\\kappa/2`, as is used in some parameterisations.

    If ``state_independent`` is true, then the above is modified by setting
    :math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent
    only if :math:`\\kappa = 1` by adding additional silent mutations.
    See :ref:`sec_mutations_existing` for more details.

    Citation: *Hasegawa M, Kishino H, Yano T (1985).  "Dating of the human-ape
    splitting by a molecular clock of mitochondrial DNA". Journal of Molecular
    Evolution. 22 (2): 160–74.*

    Note that this implementation has the root distribution as a separate
    parameter, although it defaults to the equilibrium distribution. If you
    set the root distribution to be different than the equilibrium distribution,
    then you have a nonequilibrium model, and you should make sure that's what
    you want.

    :param float kappa: Scaling parameter by which the transition matrix is modified for
        transitions (A<>G, C<>T). For example a value of ``0`` means that no transitions
        will occur, ``1`` makes the probability of transitions and
        transversions equal.
    :param list[float] equilibrium_frequencies: An array of float values of length
        4 which should sum to 1. These values are used to determine equilibrium
        frequencies (long-term averages) of the alleles. Defaults to
        [0.25, 0.25, 0.25, 0.25], i.e., all nucleotides equally likely.
    :param list[float] root_distribution: An array of float values of length 4
        which should sum to 1. These values are used to determine the ancestral state of
        each mutational site. Defaults to the value of ``equilibrium_frequencies``.
    :param bool state_independent: Whether to include a higher rate of silent mutations
        that makes the model closer to state-independent.
    """

    def __init__(
        self,
        kappa,
        equilibrium_frequencies=None,
        root_distribution=None,
        state_independent=False,
    ):
        alleles = _ACGT_ALLELES
        if equilibrium_frequencies is None:
            equilibrium_frequencies = np.array(
                [0.25, 0.25, 0.25, 0.25], dtype="float64"
            )
        else:
            equilibrium_frequencies = np.array(equilibrium_frequencies, dtype="float64")
        if root_distribution is None:
            root_distribution = equilibrium_frequencies.copy()

        transition_matrix = np.full((4, 4), 1.0)
        # positions in transition matrix for transitions
        transition_pos = ((0, 1, 2, 3), (2, 3, 0, 1))
        transition_matrix[transition_pos] = kappa
        transition_matrix *= equilibrium_frequencies
        if state_independent:
            np.fill_diagonal(transition_matrix, kappa * equilibrium_frequencies)
        else:
            np.fill_diagonal(transition_matrix, 0.0)
        row_sums = transition_matrix.sum(axis=1, dtype="float64")
        transition_matrix /= max(row_sums)
        row_sums = transition_matrix.sum(axis=1, dtype="float64")
        np.fill_diagonal(
            transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix))
        )

        super().__init__(alleles, root_distribution, transition_matrix)




[docs]
class F84(MatrixMutationModel):
    """
    The F84 mutation model (Felsenstein and Churchill, 1996). Based on the
    standard ACGT nucleotides as allelic states, this model takes into account
    transitions and transversions, and sets an equilibrium frequency for each nucleotide.
    In addition a custom ancestral frequency (``root_distribution``) can be specified.
    With ``kappa=1.0`` and the default values of the other arguments this model is equal
    to :class:`.JC69`. This model is similar to :class:`.HKY`
    but with a differing parameterisation for ``kappa``.

    This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of
    transition to transversion mutation rates, and :math:`\\pi`
    (``equilibrium_frequencies``), the vector of equilibrium nucleotide
    frequencies. If this mutation model is used with a ``mutation_rate`` of
    :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is
    equal to :math:`Q_{ij}`, where

    :math:`\\mathbf{Q} =
    \\frac{\\mu}{M} \\times
    \\begin{bmatrix}
    \\cdot & \\pi_C
    & \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_G & \\pi_T \\\\
    \\pi_A & \\cdot & \\pi_G
    & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_T \\\\
    \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_A & \\pi_C
    & \\cdot & \\pi_T \\\\
    \\pi_A & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_C
    & \\pi_G & \\cdot
    \\end{bmatrix}`

    Here :math:`M` is an overall scaling factor on the mutation rate that can be
    computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion,
    :math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_A + \\pi_G)) \\pi_j` if i and j are
    A and G in some order,
    :math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_C + \\pi_T)) \\pi_j` if i and j are
    C and T in some order, and :math:`q_{ii} = 0`. Let :math:`M` be the largest
    row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
    Then, this implementation as a :class:`.MatrixMutationModel` has the
    transition matrix :math:`P_{ij} = q_{ij} / M`,
    and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`.

    If ``state_independent`` is true, then the above is modified by setting
    :math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent
    only if :math:`\\kappa = 1` by adding additional silent mutations.
    See :ref:`sec_mutations_existing` for more details.

    Citation: *Felsenstein J, Churchill GA (January 1996). "A Hidden Markov
    Model approach to variation among sites in rate of evolution". Molecular
    Biology and Evolution. 13 (1): 93–104.*

    Note that this implementation has the root distribution as a separate
    parameter, although it defaults to the equilibrium distribution. If you
    set the root distribution to be different than the equilibrium distribution,
    then you have a nonequilibrium model, and you should make sure that's what
    you want.

    :param float kappa: Scaling parameter by which the transition matrix is modified for
        transitions (A<>G, C<>T). Must be greater than or equal to the larger of
        :math:`(\\pi_A+\\pi_G)/(\\pi_C+\\pi_T)`
        and :math:`(\\pi_C+\\pi_T)/(\\pi_A+\\pi_G)`.
    :param list[float] equilibrium_frequencies: An array of float values of length
        4 which should sum to 1. These values are used to determine equilibrium
        frequencies (long-term averages) of the alleles. Defaults to
        [0.25, 0.25, 0.25, 0.25], i.e all nucleotides equally likely.
    :param list[float] root_distribution: An array of float values of length 4
        which should sum to 1. These values are used to determine the ancestral state of
        each mutational site. Defaults to the value of ``equilibrium_frequencies``.
    :param bool state_independent: Whether to include a higher rate of silent mutations
        that makes the model closer to state-independent.
    """

    def __init__(
        self,
        kappa,
        equilibrium_frequencies=None,
        root_distribution=None,
        state_independent=False,
    ):
        alleles = _ACGT_ALLELES
        if equilibrium_frequencies is None:
            equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25]
        if root_distribution is None:
            root_distribution = equilibrium_frequencies.copy()

        transition_matrix = np.full((4, 4), 1.0, dtype="float64")
        # positions in transition matrix for transitions
        transition_pos_AG = ((0, 2), (2, 0))
        transition_pos_CT = ((1, 3), (3, 1))
        p_AG = equilibrium_frequencies[0] + equilibrium_frequencies[2]
        p_CT = equilibrium_frequencies[1] + equilibrium_frequencies[3]
        gamma = np.float64(1 + (kappa - 1) / np.array([p_AG, p_CT]))
        transition_matrix[transition_pos_AG] = gamma[0]
        transition_matrix[transition_pos_CT] = gamma[1]
        transition_matrix *= equilibrium_frequencies
        if state_independent:
            np.fill_diagonal(
                transition_matrix,
                equilibrium_frequencies * np.concatenate([gamma, gamma]),
            )
        else:
            np.fill_diagonal(transition_matrix, 0.0)
        row_sums = transition_matrix.sum(axis=1)
        transition_matrix = transition_matrix / max(row_sums)
        row_sums = transition_matrix.sum(axis=1, dtype="float64")
        np.fill_diagonal(
            transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix))
        )

        super().__init__(alleles, root_distribution, transition_matrix)




[docs]
class GTR(MatrixMutationModel):
    """
    The Generalised Time-Reversible nucleotide mutation model, a general
    parameterisation of a time-reversible mutation process (Tavaré et al.
    1986). It allows specification of per-nucleotide equilibrium frequencies
    and equilibrium transition rates.

    This model is parameterised by the vector :math:`r` (``relative_rates``),
    and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide
    frequencies. The entries of ``relative_rates`` are, in this order,
    :math:`(r_{AC}, r_{AG}, r_{AT}, r_{CG}, r_{CT}, r_{GT})`. If this mutation model
    is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the
    i-th to the j-th allele is :math:`Q_{ij}`, where

    :math:`\\mathbf{Q} =
    \\frac{\\mu}{M} \\times
    \\begin{bmatrix}
    \\cdot        &  r_{AC} \\pi_C  & r_{AG} \\pi_G & r_{AT} \\pi_T    \\\\
    r_{AC} \\pi_A &  \\cdot         & r_{CG} \\pi_G & r_{CT} \\pi_T    \\\\
    r_{AG} \\pi_A &  r_{CG} \\pi_C  & \\cdot        & r_{GT} \\pi_T    \\\\
    r_{AT} \\pi_A &  r_{CT} \\pi_C  & r_{GT} \\pi_G & \\cdot
    \\end{bmatrix}`

    Here :math:`M` is an overall scaling factor on the mutation rate that can be
    computed as follows: let :math:`q_{ij} = r_{ij} \\pi_j`
    and :math:`q_{ii} = 0`,
    and then let :math:`M` be the largest row sum of :math:`q`,
    i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
    Then, this implementation as a :class:`.MatrixMutationModel` has
    the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii}`
    chosen so rows sum to one.

    If ``state_independent`` is true, then the above is modified by setting
    :math:`q_{ii} = \\pi_i`. This makes the model completely state-independent
    only if all :math:`r_{ij} = 1` by adding additional silent mutations.
    See :ref:`sec_mutations_existing` for more details.

    Citation: *Tavaré S (1986). "Some Probabilistic and Statistical Problems in the
    Analysis of DNA Sequences". Lectures on Mathematics in the Life Sciences. 17:
    57–86.*

    Note that this implementation has the root distribution as a separate
    parameter, although it defaults to the equilibrium distribution. If you
    set the root distribution to be different than the equilibrium distribution,
    then you have a nonequilibrium model, and you should make sure that's what
    you want.

    :param list[float] relative_rates: Controls the relative rates of
        mutation between each pair of nucleotides, in order "A<>C", "A<>G", "A<>T",
        "C<>G", "C<>T", and "G<>T".
    :param list[float] equilibrium_frequencies: The equilibrium base
        frequencies, a list of four probabilities that sum to one. (Default: equal
        frequencies.)
    :param list[float] root_distribution: An array of float values of length 4
        which should sum to 1. These values are used to determine the ancestral state of
        each mutational site. Defaults to the value of ``equilibrium_frequencies``.
    :param bool state_independent: Whether to include a higher rate of silent mutations
        that makes the model closer to state-independent.
    """

    def __init__(
        self,
        relative_rates,
        equilibrium_frequencies=None,
        root_distribution=None,
        state_independent=False,
    ):
        alleles = _ACGT_ALLELES
        assert len(relative_rates) == 6
        if equilibrium_frequencies is None:
            equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25]
        if root_distribution is None:
            root_distribution = equilibrium_frequencies

        transition_matrix = np.zeros((4, 4))
        # relative_rates: [A->C, A->G, A->T, C->G, C->T, G->T]
        tri_upper = np.triu_indices_from(transition_matrix, k=1)
        transition_matrix[tri_upper] = relative_rates
        transition_matrix += transition_matrix.T
        if state_independent:
            np.fill_diagonal(transition_matrix, 1.0)
        transition_matrix *= equilibrium_frequencies
        row_sums = transition_matrix.sum(axis=1)
        transition_matrix = transition_matrix / max(row_sums)
        row_sums = transition_matrix.sum(axis=1, dtype="float64")
        np.fill_diagonal(transition_matrix, np.diag(transition_matrix) + 1.0 - row_sums)

        super().__init__(alleles, root_distribution, transition_matrix)




[docs]
class BLOSUM62(MatrixMutationModel):
    """
    The BLOSUM62 model of time-reversible amino acid mutation. This model has
    no free parameters.

    The model is parameterised by a 20-by-20 symmetric matrix of relative rates,
    :math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi`
    (for the precise order, see this model's `.alleles` attribute).
    If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then
    the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where

    :math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,`

    where :math:`M` is an overall scaling factor chosen so that the largest
    row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one.
    In this model, :math:`M = 1.726203705809619`.
    This implementation as a :class:`.MatrixMutationModel` has transition matrix
    :math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one,
    and root distribution equal to the equilibrium frequencies, :math:`\\pi`,
    so the matrix :math:`B` can be recovered as follows:

    .. code-block:: python

       model = msprime.BLOSUM62()
       M = 1.726203705809619
       B = M * model.transition_matrix / model.root_distribution

    Citation: The values of :math:`B` and :math:`\\pi` used here were copied from
    `Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_.
    The original citation is: *Henikoff, S., and J. G. Henikoff. 1992. PNAS USA
    89:10915-10919,* and the numerical values were provided in: *Yu,Y.-K.,
    Wootton, J.C. and Altschul, S.F. (2003) The compositional adjustment of amino
    acid substitution matrices. Proc. Natl Acad. Sci., USA, 100, 15688–15693.*
    """

    def __init__(self):
        alleles = _AMINO_ACIDS
        num_alleles = len(alleles)
        root_distribution = [
            0.074,
            0.052,
            0.045,
            0.054,
            0.025,
            0.034,
            0.054,
            0.074,
            0.026,
            0.068,
            0.099,
            0.058,
            0.025,
            0.047,
            0.039,
            0.057,
            0.051,
            0.013,
            0.032,
            0.073,
        ]
        relative_rates = [
            0.735790389698,
            0.485391055466,
            1.297446705134,
            0.543161820899,
            0.500964408555,
            3.180100048216,
            1.459995310470,
            0.227826574209,
            0.397358949897,
            0.240836614802,
            1.199705704602,
            3.020833610064,
            1.839216146992,
            1.190945703396,
            0.329801504630,
            1.170949042800,
            1.360574190420,
            1.240488508640,
            3.761625208368,
            0.140748891814,
            5.528919177928,
            1.955883574960,
            0.418763308518,
            1.355872344485,
            0.798473248968,
            0.418203192284,
            0.609846305383,
            0.423579992176,
            0.716241444998,
            1.456141166336,
            2.414501434208,
            0.778142664022,
            0.354058109831,
            2.435341131140,
            1.626891056982,
            0.539859124954,
            0.605899003687,
            0.232036445142,
            0.283017326278,
            0.418555732462,
            0.774894022794,
            0.236202451204,
            0.186848046932,
            0.189296292376,
            0.252718447885,
            0.800016530518,
            0.622711669692,
            0.211888159615,
            0.218131577594,
            0.831842640142,
            0.580737093181,
            0.372625175087,
            0.217721159236,
            0.348072209797,
            3.890963773304,
            1.295201266783,
            5.411115141489,
            1.593137043457,
            1.032447924952,
            0.285078800906,
            3.945277674515,
            2.802427151679,
            0.752042440303,
            1.022507035889,
            0.406193586642,
            0.445570274261,
            1.253758266664,
            0.983692987457,
            0.648441278787,
            0.222621897958,
            0.767688823480,
            2.494896077113,
            0.555415397470,
            0.459436173579,
            0.984311525359,
            3.364797763104,
            6.030559379572,
            1.073061184332,
            0.492964679748,
            0.371644693209,
            0.354861249223,
            0.281730694207,
            0.441337471187,
            0.144356959750,
            0.291409084165,
            0.368166464453,
            0.714533703928,
            1.517359325954,
            2.064839703237,
            0.266924750511,
            1.773855168830,
            1.173275900924,
            0.448133661718,
            0.494887043702,
            0.730628272998,
            0.356008498769,
            0.858570575674,
            0.926563934846,
            0.504086599527,
            0.527007339151,
            0.388355409206,
            0.374555687471,
            1.047383450722,
            0.454123625103,
            0.233597909629,
            4.325092687057,
            1.122783104210,
            2.904101656456,
            1.582754142065,
            1.197188415094,
            1.934870924596,
            1.769893238937,
            1.509326253224,
            1.117029762910,
            0.357544412460,
            0.352969184527,
            1.752165917819,
            0.918723415746,
            0.540027644824,
            1.169129577716,
            1.729178019485,
            0.914665954563,
            1.898173634533,
            0.934187509431,
            1.119831358516,
            1.277480294596,
            1.071097236007,
            0.641436011405,
            0.585407090225,
            1.179091197260,
            0.915259857694,
            1.303875200799,
            1.488548053722,
            0.488206118793,
            1.005451683149,
            5.151556292270,
            0.465839367725,
            0.426382310122,
            0.191482046247,
            0.145345046279,
            0.527664418872,
            0.758653808642,
            0.407635648938,
            0.508358924638,
            0.301248600780,
            0.341985787540,
            0.691474634600,
            0.332243040634,
            0.888101098152,
            2.074324893497,
            0.252214830027,
            0.387925622098,
            0.513128126891,
            0.718206697586,
            0.720517441216,
            0.538222519037,
            0.261422208965,
            0.470237733696,
            0.958989742850,
            0.596719300346,
            0.308055737035,
            4.218953969389,
            0.674617093228,
            0.811245856323,
            0.717993486900,
            0.951682162246,
            6.747260430801,
            0.369405319355,
            0.796751520761,
            0.801010243199,
            4.054419006558,
            2.187774522005,
            0.438388343772,
            0.312858797993,
            0.258129289418,
            1.116352478606,
            0.530785790125,
            0.524253846338,
            0.253340790190,
            0.201555971750,
            8.311839405458,
            2.231405688913,
            0.498138475304,
            2.575850755315,
            0.838119610178,
            0.496908410676,
            0.561925457442,
            2.253074051176,
            0.266508731426,
            1.000000000000,
        ]
        transition_matrix = np.zeros((num_alleles, num_alleles))
        tril = np.tril_indices(num_alleles, k=-1)
        transition_matrix[tril] = relative_rates
        transition_matrix += np.tril(transition_matrix).T
        transition_matrix *= root_distribution
        row_sums = transition_matrix.sum(axis=1)
        transition_matrix = transition_matrix / max(row_sums)
        row_sums = transition_matrix.sum(axis=1, dtype="float64")
        np.fill_diagonal(transition_matrix, 1.0 - row_sums)
        super().__init__(alleles, root_distribution, transition_matrix)




[docs]
class PAM(MatrixMutationModel):
    """
    The PAM model of time-reversible amino acid mutation. This model has no
    free parameters.

    The model is parameterised by a 20-by-20 symmetric matrix of relative rates,
    :math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi`
    (for the precise order, see this model's `.alleles` attribute).
    If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then
    the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where

    :math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,`

    where :math:`M` is an overall scaling factor chosen so that the largest
    row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one.
    In this model, :math:`M = 1.78314862248`.
    This implementation as a :class:`.MatrixMutationModel` has transition matrix
    :math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one,
    and root distribution equal to the equilibrium frequencies, :math:`\\pi`,
    so the matrix :math:`B` can be recovered as follows:

    .. code-block:: python

       model = msprime.PAM()
       M = 1.78314862248
       B = M * model.transition_matrix / model.root_distribution

    Citation: The values of :math:`B` and :math:`\\pi` used here were copied from
    `Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_,
    and follow the "Dayhoff DCMut" model as described in *Kosiol, C., and Goldman,
    N. 2005. Different versions of the Dayhoff rate matrix. Molecular Biology and
    Evolution 22:193-199.* The original citation is: *Dayhoff, M.O., Schwartz,
    R.M., Orcutt, B.C. (1978). A model of evolutionary change in proteins. Atlas of
    Protein Sequence Structures, Vol. 5, Suppl. 3, National Biomedical Research
    Foundation, Washington DC, pp. 345-352.*
    """

    def __init__(self):
        alleles = _AMINO_ACIDS
        num_alleles = len(alleles)
        root_distribution = [
            0.087127,
            0.040904,
            0.040432,
            0.046872,
            0.033474,
            0.038255,
            0.049530,
            0.088612,
            0.033619,
            0.036886,
            0.085357,
            0.080481,
            0.014753,
            0.039772,
            0.050680,
            0.069577,
            0.058542,
            0.010494,
            0.029916,
            0.064717,
        ]
        relative_rates = [
            0.267828,
            0.984474,
            0.327059,
            1.199805,
            0.000000,
            8.931515,
            0.360016,
            0.232374,
            0.000000,
            0.000000,
            0.887753,
            2.439939,
            1.028509,
            1.348551,
            0.000000,
            1.961167,
            0.000000,
            1.493409,
            11.388659,
            0.000000,
            7.086022,
            2.386111,
            0.087791,
            1.385352,
            1.240981,
            0.107278,
            0.281581,
            0.811907,
            0.228116,
            2.383148,
            5.290024,
            0.868241,
            0.282729,
            6.011613,
            0.439469,
            0.106802,
            0.653416,
            0.632629,
            0.768024,
            0.239248,
            0.438074,
            0.180393,
            0.609526,
            0.000000,
            0.076981,
            0.406431,
            0.154924,
            0.341113,
            0.000000,
            0.000000,
            0.730772,
            0.112880,
            0.071514,
            0.443504,
            2.556685,
            0.258635,
            4.610124,
            3.148371,
            0.716913,
            0.000000,
            1.519078,
            0.830078,
            0.267683,
            0.270475,
            0.460857,
            0.180629,
            0.717840,
            0.896321,
            0.000000,
            0.000000,
            0.000000,
            1.127499,
            0.304803,
            0.170372,
            0.000000,
            3.332732,
            5.230115,
            2.411739,
            0.183641,
            0.136906,
            0.138503,
            0.000000,
            0.000000,
            0.000000,
            0.000000,
            0.153478,
            0.475927,
            1.951951,
            1.565160,
            0.000000,
            0.921860,
            2.485920,
            1.028313,
            0.419244,
            0.133940,
            0.187550,
            1.526188,
            0.507003,
            0.347153,
            0.933709,
            0.119152,
            0.316258,
            0.335419,
            0.170205,
            0.110506,
            4.051870,
            1.531590,
            4.885892,
            0.956097,
            1.598356,
            0.561828,
            0.793999,
            2.322243,
            0.353643,
            0.247955,
            0.171432,
            0.954557,
            0.619951,
            0.459901,
            2.427202,
            3.680365,
            0.265745,
            2.271697,
            0.660930,
            0.162366,
            0.525651,
            0.340156,
            0.306662,
            0.226333,
            1.900739,
            0.331090,
            1.350599,
            1.031534,
            0.136655,
            0.782857,
            5.436674,
            0.000000,
            2.001375,
            0.224968,
            0.000000,
            0.000000,
            0.000000,
            0.000000,
            0.000000,
            0.270564,
            0.000000,
            0.461776,
            0.000000,
            0.000000,
            0.762354,
            0.000000,
            0.740819,
            0.000000,
            0.244139,
            0.078012,
            0.946940,
            0.000000,
            0.953164,
            0.000000,
            0.214717,
            0.000000,
            1.265400,
            0.374834,
            0.286572,
            0.132142,
            0.000000,
            6.952629,
            0.000000,
            0.336289,
            0.417839,
            0.608070,
            2.059564,
            0.240368,
            0.158067,
            0.178316,
            0.484678,
            0.346983,
            0.367250,
            0.538165,
            0.438715,
            8.810038,
            1.745156,
            0.103850,
            2.565955,
            0.123606,
            0.485026,
            0.303836,
            1.561997,
            0.000000,
            0.279379,
        ]
        transition_matrix = np.zeros((num_alleles, num_alleles))
        tril = np.tril_indices(num_alleles, k=-1)
        transition_matrix[tril] = relative_rates
        transition_matrix += np.tril(transition_matrix).T
        transition_matrix *= root_distribution
        row_sums = transition_matrix.sum(axis=1)
        transition_matrix = transition_matrix / max(row_sums)
        row_sums = transition_matrix.sum(axis=1, dtype="float64")
        np.fill_diagonal(transition_matrix, 1.0 - row_sums)
        super().__init__(alleles, root_distribution, transition_matrix)



# Pre 1.0, we had these constants to define the alphabet for the
# simple infinite sites mutations. These should be deprecated and removed
# along with the InfiniteSites class.
BINARY = 0
NUCLEOTIDES = 1



[docs]
class InfiniteSites(MatrixMutationModel):
    # This mutation model is defined for backwards compatibility, and is a remnant
    # of an earlier design.
    def __init__(self, alphabet=BINARY):
        self.alphabet = alphabet
        models = {BINARY: BinaryMutationModel(), NUCLEOTIDES: JC69()}
        if alphabet not in models:
            raise ValueError("Bad alphabet")
        model = models[alphabet]
        super().__init__(
            model.alleles, model.root_distribution, model.transition_matrix
        )



def _simple_mutate(
    *,
    tables,
    sequence_length,
    rate,
    random_generator,
):
    """
    Generate mutations directly using the low-level interfaces. Only
    for internal use with the ms and old-style simulate() interfaces.
    """
    rate_map = intervals.RateMap.uniform(sequence_length, rate)
    _msprime.sim_mutations(
        tables,
        random_generator,
        rate_map.asdict(),
        model=BinaryMutationModel(),
        discrete_genome=False,
    )



[docs]
def mutate(
    tree_sequence,
    rate=None,
    random_seed=None,
    model=None,
    keep=False,
    start_time=None,
    end_time=None,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate in
    measured generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    .. note:: This function is deprecated and we recommend new code use
        the :func:`sim_mutations` function instead, which is similar
        but offers more functionality and defaults to simulating mutations
        on a discrete genome. This function will be supported indefinitely,
        however.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. By default mutations from the
    infinite sites model with a binary alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``max_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.InfiniteSites`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time at which a mutation can occur
        (Default: no restriction).
    :return: The :class:`tskit.TreeSequence` object  resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    """
    # It's probably an error if someone tries to use a new-style model
    # with the old mutate() function, and so we raise an error if anyone
    # does, to help them move to the new interface.
    if model is not None and not isinstance(model, InfiniteSites):
        raise ValueError(
            "This is a legacy interface which does not support general "
            "mutation models. Please use the sim_ancestry function instead."
        )
    # The legacy function simulates 0/1 alleles by default.
    if model is None:
        model = BinaryMutationModel()
    return sim_mutations(
        tree_sequence,
        rate=rate,
        random_seed=random_seed,
        model=model,
        keep=keep,
        start_time=start_time,
        end_time=end_time,
        discrete_genome=False,
    )



# String names here should match Seq-Gen where possible
MODEL_MAP = {
    "infinite_alleles": InfiniteAlleles,
    "binary": BinaryMutationModel,
    "jc69": JC69,
    "blosum62": BLOSUM62,
    "pam": PAM,
    # "slim": SLiMMutationModel(), Needs type argument so can't be string init'd
    # "hky": HKY(), Needs kappa argument
    # "f84": F84(), Needs kappa argument
    # "gtr": GTR(), Needs relative_rates argument
}


def mutation_model_factory(model):
    """
    Returns a mutation model corresponding to the specified model.
    - If model is None, the default mutation model is returned.
    - If model is a string, return the corresponding model instance.
    - If model is an instance of MutationModel, return it.
    - Otherwise raise a type error.
    """

    if model is None:
        model_instance = JC69()
    elif isinstance(model, str):
        lower_model = model.lower()
        if lower_model not in MODEL_MAP:
            raise ValueError(
                "Model '{}' unknown. Choose from {}".format(
                    model, sorted(MODEL_MAP.keys())
                )
            )
        model_instance = MODEL_MAP[lower_model]()
    elif isinstance(model, MutationModel):
        model_instance = model
    else:
        raise TypeError(
            "Mutation model must be a string or an instance of MutationModel"
        )
    return model_instance



[docs]
def sim_mutations(
    tree_sequence,
    rate=None,
    *,
    random_seed=None,
    model=None,
    start_time=None,
    end_time=None,
    discrete_genome=None,
    keep=None,
    record_provenance=True,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    per unit of sequence length, per unit of time. By default, mutations are
    generated at discrete sites along the genome and multiple mutations
    can occur at any given site. A continuous sequence, infinite-sites model
    can also be specified by setting the ``discrete_genome`` parameter to
    False.

    If the ``model`` parameter is specified, this determines the model
    of sequence evolution under which mutations are generated.
    The default mutation model is the :class:`msprime.JC69`,
    a symmetrical mutation model among the ACGT alleles.
    See the :ref:`sec_mutations_models` section for details of available models.

    If a ``random_seed`` is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``max_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    If the tree sequence already has mutations, these are by default retained,
    but can be discarded by passing ``keep=False``. However, adding new
    mutations to a tree sequence with existing mutations must be done with
    caution, since it can lead to incorrect or nonsensical results if mutation
    probabilities differ by ancestral state. (As an extreme example, suppose
    that X->Y and X->Z are allowable transitions, but Y->Z is not. If a branch
    already has an X->Y mutation on it, then calling `sim_mutations(...,
    keep=True)` might insert an X->Z mutation above the existing mutation, thus
    implying the impossible chain X->Y->Z.)  However, the effect on nucleotide
    models of mutation are generally very small.

    .. note:: Many mutation models will insert silent transitions (e.g.,
        placing a mutation to A above an existing mutation to A). Such mutations
        are harmless and are required for us to guarantee the statistical
        properties of the process of sequentially adding mutations to a tree
        sequence.

    :param tskit.TreeSequence tree_sequence: The tree sequence we
        wish to throw mutations onto.
    :param float rate: The rate of mutation per unit of sequence length
        per unit time, as either a single number (for a uniform rate) or as a
        :class:`.RateMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
        See the :ref:`sec_randomness_seeds` section for usage examples.
    :param MutationModel model: The mutation model to use when generating
        mutations. This can either be a string (e.g., ``"jc69"``) or
        an instance of a simulation model class
        e.g, ``msprime.F84(kappa=0.5)``.
        If not specified or None, the :class:`.JC69`
        mutation model is used. Please see the
        :ref:`sec_mutations_models` section for more details
        on specifying mutation models.
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete_genome: Whether to generate mutations at only integer positions
        along the genome (Default=True).
    :param bool keep: Whether to keep existing mutations. (default: True)
    :param bool record_provenance: If True, record all input parameters
        in the tree sequence :ref:`tskit:sec_provenance`.
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")
    seed = random_seed
    if random_seed is None:
        seed = core.get_random_seed()
    else:
        seed = int(seed)

    provenance_dict = None
    if record_provenance:
        parameters = dict(
            command="sim_mutations",
            tree_sequence=tree_sequence,
            rate=rate,
            model=model,
            start_time=start_time,
            end_time=end_time,
            discrete_genome=discrete_genome,
            keep=keep,
            random_seed=seed,
        )
        provenance_dict = provenance.get_provenance_dict(parameters)

    if rate is None:
        rate = 0
    try:
        rate = float(rate)
        rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate)
    except TypeError:
        rate_map = rate
    if not isinstance(rate_map, intervals.RateMap):
        raise TypeError("rate must be a float or a RateMap")

    if tables.time_units == tskit.TIME_UNITS_UNCALIBRATED:
        raise ValueError(
            "Simulating mutations doesn't make sense when time is uncalibrated"
        )

    start_time = -sys.float_info.max if start_time is None else float(start_time)
    end_time = sys.float_info.max if end_time is None else float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")
    discrete_genome = core._parse_flag(discrete_genome, default=True)
    keep = core._parse_flag(keep, default=True)

    model = mutation_model_factory(model)
    rng = _msprime.RandomGenerator(seed)
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    _msprime.sim_mutations(
        tables=lwt,
        random_generator=rng,
        rate_map=rate_map.asdict(),
        model=model,
        discrete_genome=discrete_genome,
        keep=keep,
        start_time=start_time,
        end_time=end_time,
    )

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    if provenance_dict is not None:
        tables.provenances.add_row(provenance.json_encode_provenance(provenance_dict))
    return tables.tree_sequence()