Source code for msprime.mutations

#
# Copyright (C) 2018-2021 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime.  If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for generating mutations on a given tree sequence.
"""
import inspect
import math
import numbers
import sys

import numpy as np
import tskit

from . import core
from . import intervals
from . import provenance
from msprime import _msprime

_ACGT_ALLELES = ["A", "C", "G", "T"]
_AMINO_ACIDS = [
    "A",
    "R",
    "N",
    "D",
    "C",
    "Q",
    "E",
    "G",
    "H",
    "I",
    "L",
    "K",
    "M",
    "F",
    "P",
    "S",
    "T",
    "W",
    "Y",
    "V",
]


[docs]class MutationModel: """ Abstract superclass of msprime mutation models. """ def asdict(self): # This version of asdict makes sure that we have sufficient parameters # to call the constructor and recreate the class. However, this means # that subclasses *must* have an instance variable of the same name as # each parameter. # This is essential for Provenance round-tripping to work. return { key: getattr(self, key) for key in inspect.signature(self.__init__).parameters.keys() if hasattr(self, key) }
[docs]class MatrixMutationModel(_msprime.MatrixMutationModel, MutationModel): """ Superclass of the specific mutation models with a finite set of states. This class can be used to create a matrix mutation model with custom alleles and probabilities. For details of how this works see :ref:`sec_mutations_matrix_mutation_models_details`. :param list[str] alleles: A list of the possible alleles generated by the model. Each entry is a string. :param list[float] root_distribution: An array of float values the same length as ``alleles`` which should sum to 1. These values are used to determine the ancestral state of each mutated site. For example if ``alleles`` is ``['A','B']`` and ``root_distribution`` is ``[1,0]`` all ancestral states will be ``'A'``. :param list[float] transition_matrix: A square 2d-matrix of transition probabilities where both dimensions are equal to the length of the ``alleles`` argument. These are used to determine the derived state at each mutation. Note that rows should sum to 1. """ def __str__(self): s = f"Mutation model with alleles {self.alleles}\n" s += " root distribution: {}\n".format( " ".join(map(str, self.root_distribution)) ) s += " transition matrix:\n" for row in self.transition_matrix: s += " {}\n".format(" ".join(map(str, row))) return s
[docs]class SLiMMutationModel(_msprime.SLiMMutationModel, MutationModel): """ An infinite-alleles model of mutation producing "SLiM-style" mutations. To agree with mutations produced by SLiM, the ancestral state of each new site is set to the empty string, and each derived state is produced by appending the "next allele" to the previous state. The result is that each allele is a comma-separated string of all mutations that have occurred up to the root. Mutations produced by SLiM carry both a ``time`` attribute, in units of "time ago" as usual for tskit, as well as a metadata attributed called "origin_generation", in units of time since the start of the simulation. Adding these two together - time since the start of the simulation plus time until the end - is equal to the total number of generations of the simulation. The origin_generation is not currently used by SLiM, but for consistency, the origin_generation attribute for mutations produced by this model is set equal to ``slim_generation`` minus ``floor(mut.time)``, where ``mut.time`` is the (tskit) time ago of the mutation. :param int type: The nonnegative integer defining the "type" of SLiM mutation that will be recorded in metadata. :param int next_id: The nonnegative integer to start assigning alleles from. (default: 0) :param int slim_generation: The "SLiM generation" time used in determining the "origin_generation" metadata attribute of mutations. This can usually be left at its default, which is 1. :param int block_size: The block size for allocating derived states. You do not need to change this unless you get an "out of memory" error due to a very large number of stacked mutations. """ def __str__(self): return ( f"Mutation model for SLiM mutations of type m{self.type}\n" f" next ID: {self.next_id}\n" )
# NOTE: we use a hacky workaround here for documenting the next_allele instance # variable because of the nasty sphinx bug: # https://github.com/sphinx-doc/sphinx/issues/2549 # This seems to be basically impossible to workaround in JupyterBook, so it # seems best to just avoid using ivars. The correct approach going forward # would appear to be to use dataclasses and to document each variable # independently (e.g., Population class).
[docs]class InfiniteAlleles(_msprime.InfiniteAllelesMutationModel, MutationModel): """ An *infinite alleles* model of mutation in which each allele is a unique positive integer. This works by keeping track of a "next allele": each time the model is asked to produce a new allele (either for an ancestral or derived state), the "next allele" is provided, and then incremented. **Variables:** **next_allele**: The *next* allele to be assigned. This increments by one each time an allele is assigned to a new mutation, and resets to ``start_allele`` each time a new ancestral state is assigned. :param int start_allele: The nonnegative integer to start assigning alleles from. (default: 0) """ def __str__(self): return ( f"Infinite alleles mutation model, beginning with allele" f" {self.start_allele}\n next allele: {self.next_allele}\n" )
[docs]class BinaryMutationModel(MatrixMutationModel): """ Basic binary mutation model with two alleles: ``"0"`` and ``"1"``, and ancestral allele always ``"0"``. This is a :class:`.MatrixMutationModel` with alleles ``["0", "1"]``, root distribution ``[1.0, 0.0]``, and transition matrix that is either ``[[0.0, 1.0], [1.0, 0.0]]`` (by default), or ``[[0.5, 0.5], [0.5, 0.5]]`` (if ``state_independent=True``). :param bool state_independent: Whether mutations will be generated in a way independent of the previous allelic state, which includes silent mutations. See :ref:`sec_mutations_existing` for more details. """ def __init__(self, state_independent=False): alleles = ["0", "1"] root_distribution = [1, 0] if state_independent: transition_matrix = [[0.5, 0.5], [0.5, 0.5]] else: transition_matrix = [[0.0, 1.0], [1.0, 0.0]] super().__init__(alleles, root_distribution, transition_matrix)
def general_microsat_rate_matrix(s, u, v, p, m, lo, hi): """ Construct the general microsat rate matrix associated with the Sainudiin et al. (2004) parameterization. While the original paper implements a continuous time markov chain, here we implement the jump process rate matrix equivalent for consistency with the :class:`.MatrixMutationModel`. In particular if :math:`q_{ij}` is an element of the originally defined transition matrix, we define :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, as a :class:`.MatrixMutationModel` this model has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`. :param float s: strength of length dependence on mutation rate (i.e. s=0 uniform rate) :param float u: constant bias parameter; U ~ (0, 1) :param float v: linear bias parameter; v ~ (-inf, inf) :param float p: probability of a single step mutation :param float m: success prob of truncated gamma distribution :param int lo: lower bound on repeat number :param int hi: upper bound on repeat number :return: rate matrix """ # Helper functions for the Sainudiin et al. rate matrix def alpha_microsat(i, u, v, lo): return max(0, min(1, u - (v * (i - lo)))) def beta_microsat(i, s, lo): """ beta helper function from Sainudiin et al. (2004) represents the mutation rate from allele i :param float s: strength of length dependence on mutation rate (i.e. s=0 uniform rate) :param int lo: lower bound :param int hi: upper bound """ return 1 + ((i - lo) * s) def gamma_microsat(m, i, j, lo, hi): """ gamma helper function from Sainudiin et al. (2004) :param float m: prob success :param int i: in-state :param int j: out-state :param int lo: lower bound of microsat repeat length distribution :param int hi: upper bound of microsat repeat length distribution """ num = m * (1 - m) ** (np.abs(i - j) - 1) if lo <= i < j <= hi: denom = 1 - (1 - m) ** (hi - i) elif lo <= j < i <= hi: denom = 1 - (1 - m) ** (i - lo) else: raise TypeError("microsat model error. i and j must be between lo and hi") return num / denom nalleles = hi - lo + 1 rate_matrix = np.zeros((nalleles, nalleles)) for i in range(lo, hi + 1): for j in range(lo, hi + 1): # shift coords to account for 0-indexing in rate matrix ii = i - lo jj = j - lo if i == j - 1: rate_matrix[ii, jj] = ( beta_microsat(i, s, lo) * alpha_microsat(i, u, v, lo) * (p + ((1 - p) * gamma_microsat(m, i, j, lo, hi))) ) elif i < j - 1: rate_matrix[ii, jj] = ( beta_microsat(i, s, lo) * alpha_microsat(i, u, v, lo) * (1 - p) * gamma_microsat(m, i, j, lo, hi) ) elif i == j + 1: rate_matrix[ii, jj] = ( beta_microsat(i, s, lo) * (1 - alpha_microsat(i, u, v, lo)) * (p + ((1 - p) * gamma_microsat(m, i, j, lo, hi))) ) elif i > j + 1: rate_matrix[ii, jj] = ( beta_microsat(i, s, lo) * (1 - alpha_microsat(i, u, v, lo)) * (1 - p) * gamma_microsat(m, i, j, lo, hi) ) else: rate_matrix[ii, jj] = 0 # scale by max row sum; then normalize to 1 row_sums = rate_matrix.sum(axis=1, dtype="float64") alpha = np.max(row_sums) rate_matrix /= alpha row_sums = rate_matrix.sum(axis=1, dtype="float64") # the max(0, *) is to avoid floating point error np.fill_diagonal(rate_matrix, np.fmax(0.0, 1.0 - row_sums)) return rate_matrix
[docs]class MicrosatMutationModel(MatrixMutationModel): """ General class for Microsatellite mutation models which parameterizes the transition matrix describing changes among alleles using five parameters according to the model of `Sainudiin et al. (2004) <https://doi.org/10.1534/genetics.103.022665>`_ .. seealso:: See the :ref:`sec_mutations_microsats` section for more details and examples. Concretely, mutation rates under this model are computed as follows. Let ``qij`` be the transition rate from equation (3) in Sainudiin et al, and let ``M`` be the largest row sum of the matrix whose i,j-th entry is ``qij`` and whose diagonals are zero. Then if the mutation rate passed to :func:`.sim_mutations` is ``m``, the mutation rate from allele ``i`` to allele ``j`` is ``m * qij / M``. Note that the values of ``lo`` and ``hi`` are in units of repeat number, and are unrelated to repeat length (e.g., the repeat itself can be dinucleotide, tri-nucleotide, etcetera). .. warning:: The default values for ``lo`` and ``hi`` are chosen arbitrarily. The default of ``lo=1`` is chosen so that all states might be represented, but please note that in empirical data, microsatellite loci are often ascertained based on the number of repeats observed. Please ensure that you set ``lo`` and ``hi`` to the appropriate values for your organism / locus of interest. :param float s: strength of length dependence on mutation rate. Defaults to 0, i.e., no length dependence. :param float u: Constant bias parameter; must be between 0 and 1. Defaults to 0.5, i.e., no bias. :param float v: Linear bias parameter. Defaults to 0, i.e., no bias. :param float p: Probability of a single step mutation. Defaults to 1. :param float m: 1 / (mean multistep mutation size). Defaults to 1. :param int lo: Lower bound on repeat number. Defaults to 2. :param int hi: Upper bound on repeat number (inclusive). Defaults to 50. :param list[float] root_distribution: An array of float values of length hi-lo+1 which should sum to 1, and give the probabilities used to choose the ancestral state of each microsatellite. Defaults to the stationary distribution of the model. """ def __init__( self, *, s=None, u=None, v=None, p=None, m=None, lo=None, hi=None, root_distribution=None, ): s = 0.0 if s is None else s u = 0.5 if u is None else u v = 0.0 if v is None else v p = 1.0 if p is None else p m = 1.0 if m is None else m lo = 1 if lo is None else lo hi = 50 if hi is None else hi params = [s, u, v, p, m, lo, hi] if any(not math.isfinite(x) for x in params): raise ValueError("All parameters must be finite") if any(not isinstance(x, numbers.Number) for x in params): raise ValueError("All parameters must be numeric") lo = int(lo) hi = int(hi) if lo > hi: raise ValueError("lo > hi") if lo < 0: raise ValueError("lo must be >= 0") if s <= -1 / (hi - lo + 1): raise ValueError("s must be > -1/(hi-lo+1)") if u <= 0 or u >= 1: raise ValueError("u must be between 0 and 1") alleles = [str(int(x)) for x in range(lo, hi + 1)] transition_matrix = general_microsat_rate_matrix(s, u, v, p, m, lo, hi) if root_distribution is None: # solve for stationary distribution S, U = np.linalg.eig(transition_matrix.T) U = np.real_if_close(U, tol=1) stationary = np.array(U[:, np.where(np.abs(S - 1.0) < 1e-8)[0][0]]) stationary = stationary / np.sum(stationary) root_distribution = stationary super().__init__(alleles, root_distribution, transition_matrix)
[docs]class SMM(MicrosatMutationModel): """ Stepwise mutation model (Ohta and Kimura, 1978), for microsatellite repeat number. This is a :class:`.MicrosatMutationModel` with alleles ``[lo, .. , hi]``, a root distribution (stationary distribution by default), and a transition matrix that allows only mutations that change the number of repeats by +/- 1. Concretely, if the mutation rate is ``m``, then an allele ``k`` mutates to ``k+1`` and ``k-1`` at rate ``m/2`` each, except that mutations above ``hi`` or below ``lo`` result in no change. :param int lo: Repeat number lower bound. See the documentation for :class:`.MicrosatMutationModel` for details. :param int hi: Repeat number upper bound. See the documentation for :class:`.MicrosatMutationModel` for details. :param list[float] root_distribution: See the documentation for :class:`.MicrosatMutationModel` for details. """ def __init__(self, lo=None, hi=None, *, root_distribution=None): super().__init__( s=0, u=0.5, v=0, p=0, m=1, lo=lo, hi=hi, root_distribution=root_distribution )
[docs]class TPM(MicrosatMutationModel): """ Two-phase mutation model of `DiRienzo et al. (1994) <https://doi.org/10.1073/pnas.91.8.3166>`_ This models evolution of microsatellite repeat number in a manner that allows for multi-step mutations in copy number of microsatellite repeats. This is parameterized by `p` the probability of a single step mutation, and `m` the success probability of the truncated gamma distribution describing the distribution of longer steps such that the mean multi-step mutation is length `1/m`. For this model both `p` and `m` need to be set to < 1.0. This is a :class:`.MicrosatMutationModel` with alleles ``[lo, .. , hi]``. For a precise definition of the mutation rates, see :class:`.MicrosatMutationModel` with ``s=0``, ``u=0.5``, and ``v=0``. See :class:`.MicrosatMutationModel` for further details of parameterization. :param float p: Probability of a single step mutation. :param float m: Success prob of truncated gamma distribution. :param int lo: Repeat number lower bound. See the documentation for :class:`.MicrosatMutationModel` for details. :param int hi: Repeat number upper bound. See the documentation for :class:`.MicrosatMutationModel` for details. :param list[float] root_distribution: See the documentation for :class:`.MicrosatMutationModel` for details. """ def __init__(self, *, p, m, lo=None, hi=None, root_distribution=None): if not (0 < p < 1.0): raise ValueError("p must be between 0 and 1") if not (0 < m < 1.0): raise ValueError("m must be between 0 and 1") super().__init__( s=0, u=0.5, v=0, p=p, m=m, lo=lo, hi=hi, root_distribution=root_distribution )
[docs]class EL2(MicrosatMutationModel): """ Equal rate, linear biased, two-phase mutation model of `Garza et al. (1995) <10.1093/oxfordjournals.molbev.a040239>`_ as parameterized by `Sainudiin et al. (2004) <https://doi.org/10.1534/genetics.103.022665>`_ This models evolution of microsatellite repeat number in a manner that allows for multi-step mutations in copy number. This is parameterized by `m` the success probability of the truncated gamma distribution describing the distribution of longer steps, such that the mean multi-step mutation is length `1/m`. `u` the constant bias parameter which determines bias (if any) in expansion or contraction of the repeat number, and `v` the linear bias parameter which determines mutation rate variation associated with repeat number. For this model `m` need to be set to < 1.0, and `u` needs to be set to between 0 and 1. and `v` can take any real, positive or negative value. See :class:`.MicrosatMutationModel` for details on the constant bias parameter `u` and a linear bias parameter `v`. This is a :class:`.MicrosatMutationModel` with alleles ``[lo, .. , hi]`` For a precise definition of the mutation rates, see :class:`.MicrosatMutationModel` with ``s=0``. :param float m: Success prob of truncated gamma distribution. :param float u: Constant bias parameter. :param float v: Linear bias parameter. :param int lo: Repeat number lower bound. See the documentation for :class:`.MicrosatMutationModel` for details. :param int hi: Repeat number upper bound. See the documentation for :class:`.MicrosatMutationModel` for details. :param list[float] root_distribution: See the documentation for :class:`.MicrosatMutationModel` for details. """ def __init__(self, *, m, u, v, lo=None, hi=None, root_distribution=None): if not (0 < m < 1.0): raise ValueError("m must be between 0 and 1") if not (0 < u < 1.0): raise ValueError("u must be between 0 and 1") super().__init__( s=0, u=u, v=v, p=0, m=m, lo=lo, hi=hi, root_distribution=root_distribution )
[docs]class JC69(MatrixMutationModel): """ Jukes-Cantor mutation model (Jukes and Cantor 1969). Based on the standard ACGT nucleotides as allelic states, this model assumes equal probabilities for ancestral state and equal probabilities for all possible transitions. This is a :class:`.MatrixMutationModel` with alleles ``["A", "C", "G", "T"]``, root distribution ``[1/4, 1/4, 1/4, 1/4]``, and default transition matrix ``[[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3], [1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]]``. It has no free parameters. If ``state_independent`` is True, then instead all entries of the transition matrix will be 1/4, so one-quarter of the mutations will be silent. See :ref:`sec_mutations_existing` for more details. Citation: *Jukes TH, Cantor CR (1969). Evolution of Protein Molecules. pp. 21-132.* :param bool state_independent: Whether mutations will be generated in a way independent of the previous allelic state. """ def __init__(self, state_independent=False): alleles = _ACGT_ALLELES root_distribution = [0.25, 0.25, 0.25, 0.25] transition_matrix = np.zeros((4, 4)) if state_independent: transition_matrix[:] = 1 / 4 else: transition_matrix[:] = 1 / 3 np.fill_diagonal(transition_matrix, 0) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class HKY(MatrixMutationModel): """ The Hasegawa, Kishino and Yano mutation model (Hasegawa et al. 1985). Based on the standard ACGT nucleotides as allelic states, this model allows different rates for transitions and transversions, and sets an equilibrium frequency for each nucleotide. In addition a custom ancestral frequency (``root_distribution``) can be specified. With ``kappa=1.0`` and the default values of the other arguments this model is equal to :class:`.JC69`. This model is similar to :class:`.F84` but with a differing parameterisation for ``kappa``. This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of transition to transversion mutation rates, and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide frequencies. If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is equal to :math:`Q_{ij}`, where :math:`\\mathbf{Q} = \\frac{\\mu}{M} \\times \\begin{bmatrix} \\cdot & \\pi_C & \\kappa \\pi_G & \\pi_T \\\\ \\pi_A & \\cdot & \\pi_G & \\kappa \\pi_T \\\\ \\kappa \\pi_A & \\pi_C & \\cdot & \\pi_T \\\\ \\pi_A & \\kappa \\pi_C & \\pi_G & \\cdot \\end{bmatrix}` Here :math:`M` is an overall scaling factor on the mutation rate that can be computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion, and :math:`q_{ij} = \\kappa \\pi_j` otherwise. Set :math:`q_{ii} = 0`, and then let :math:`M` be the largest row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, this implementation as a :class:`.MatrixMutationModel` has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`. Note also that :math:`\\kappa` is the ratio of *individual* mutation rates, not the ratio of *total* transition to transversion rates, which would be :math:`\\kappa/2`, as is used in some parameterisations. If ``state_independent`` is true, then the above is modified by setting :math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent only if :math:`\\kappa = 1` by adding additional silent mutations. See :ref:`sec_mutations_existing` for more details. Citation: *Hasegawa M, Kishino H, Yano T (1985). "Dating of the human-ape splitting by a molecular clock of mitochondrial DNA". Journal of Molecular Evolution. 22 (2): 160–74.* Note that this implementation has the root distribution as a separate parameter, although it defaults to the equilibrium distribution. If you set the root distribution to be different than the equilibrium distribution, then you have a nonequilibrium model, and you should make sure that's what you want. :param float kappa: Scaling parameter by which the transition matrix is modified for transitions (A<>G, C<>T). For example a value of ``0`` means that no transitions will occur, ``1`` makes the probability of transitions and transversions equal. :param list[float] equilibrium_frequencies: An array of float values of length 4 which should sum to 1. These values are used to determine equilibrium frequencies (long-term averages) of the alleles. Defaults to [0.25, 0.25, 0.25, 0.25], i.e., all nucleotides equally likely. :param list[float] root_distribution: An array of float values of length 4 which should sum to 1. These values are used to determine the ancestral state of each mutational site. Defaults to the value of ``equilibrium_frequencies``. :param bool state_independent: Whether to include a higher rate of silent mutations that makes the model closer to state-independent. """ def __init__( self, kappa, equilibrium_frequencies=None, root_distribution=None, state_independent=False, ): alleles = _ACGT_ALLELES if equilibrium_frequencies is None: equilibrium_frequencies = np.array( [0.25, 0.25, 0.25, 0.25], dtype="float64" ) else: equilibrium_frequencies = np.array(equilibrium_frequencies, dtype="float64") if root_distribution is None: root_distribution = equilibrium_frequencies.copy() transition_matrix = np.full((4, 4), 1.0) # positions in transition matrix for transitions transition_pos = ((0, 1, 2, 3), (2, 3, 0, 1)) transition_matrix[transition_pos] = kappa transition_matrix *= equilibrium_frequencies if state_independent: np.fill_diagonal(transition_matrix, kappa * equilibrium_frequencies) else: np.fill_diagonal(transition_matrix, 0.0) row_sums = transition_matrix.sum(axis=1, dtype="float64") transition_matrix /= max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal( transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix)) ) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class F84(MatrixMutationModel): """ The F84 mutation model (Felsenstein and Churchill, 1996). Based on the standard ACGT nucleotides as allelic states, this model takes into account transitions and transversions, and sets an equilibrium frequency for each nucleotide. In addition a custom ancestral frequency (``root_distribution``) can be specified. With ``kappa=1.0`` and the default values of the other arguments this model is equal to :class:`.JC69`. This model is similar to :class:`.HKY` but with a differing parameterisation for ``kappa``. This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of transition to transversion mutation rates, and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide frequencies. If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is equal to :math:`Q_{ij}`, where :math:`\\mathbf{Q} = \\frac{\\mu}{M} \\times \\begin{bmatrix} \\cdot & \\pi_C & \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_G & \\pi_T \\\\ \\pi_A & \\cdot & \\pi_G & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_T \\\\ \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_A & \\pi_C & \\cdot & \\pi_T \\\\ \\pi_A & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_C & \\pi_G & \\cdot \\end{bmatrix}` Here :math:`M` is an overall scaling factor on the mutation rate that can be computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion, :math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_A + \\pi_G)) \\pi_j` if i and j are A and G in some order, :math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_C + \\pi_T)) \\pi_j` if i and j are C and T in some order, and :math:`q_{ii} = 0`. Let :math:`M` be the largest row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, this implementation as a :class:`.MatrixMutationModel` has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`. If ``state_independent`` is true, then the above is modified by setting :math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent only if :math:`\\kappa = 1` by adding additional silent mutations. See :ref:`sec_mutations_existing` for more details. Citation: *Felsenstein J, Churchill GA (January 1996). "A Hidden Markov Model approach to variation among sites in rate of evolution". Molecular Biology and Evolution. 13 (1): 93–104.* Note that this implementation has the root distribution as a separate parameter, although it defaults to the equilibrium distribution. If you set the root distribution to be different than the equilibrium distribution, then you have a nonequilibrium model, and you should make sure that's what you want. :param float kappa: Scaling parameter by which the transition matrix is modified for transitions (A<>G, C<>T). Must be greater than or equal to the larger of :math:`(\\pi_A+\\pi_G)/(\\pi_C+\\pi_T)` and :math:`(\\pi_C+\\pi_T)/(\\pi_A+\\pi_G)`. :param list[float] equilibrium_frequencies: An array of float values of length 4 which should sum to 1. These values are used to determine equilibrium frequencies (long-term averages) of the alleles. Defaults to [0.25, 0.25, 0.25, 0.25], i.e all nucleotides equally likely. :param list[float] root_distribution: An array of float values of length 4 which should sum to 1. These values are used to determine the ancestral state of each mutational site. Defaults to the value of ``equilibrium_frequencies``. :param bool state_independent: Whether to include a higher rate of silent mutations that makes the model closer to state-independent. """ def __init__( self, kappa, equilibrium_frequencies=None, root_distribution=None, state_independent=False, ): alleles = _ACGT_ALLELES if equilibrium_frequencies is None: equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25] if root_distribution is None: root_distribution = equilibrium_frequencies.copy() transition_matrix = np.full((4, 4), 1.0, dtype="float64") # positions in transition matrix for transitions transition_pos_AG = ((0, 2), (2, 0)) transition_pos_CT = ((1, 3), (3, 1)) p_AG = equilibrium_frequencies[0] + equilibrium_frequencies[2] p_CT = equilibrium_frequencies[1] + equilibrium_frequencies[3] gamma = np.float64(1 + (kappa - 1) / np.array([p_AG, p_CT])) transition_matrix[transition_pos_AG] = gamma[0] transition_matrix[transition_pos_CT] = gamma[1] transition_matrix *= equilibrium_frequencies if state_independent: np.fill_diagonal( transition_matrix, equilibrium_frequencies * np.concatenate([gamma, gamma]), ) else: np.fill_diagonal(transition_matrix, 0.0) row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal( transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix)) ) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class GTR(MatrixMutationModel): """ The Generalised Time-Reversible nucleotide mutation model, a general parameterisation of a time-reversible mutation process (Tavaré et al. 1986). It allows specification of per-nucleotide equilibrium frequencies and equilibrium transition rates. This model is parameterised by the vector :math:`r` (``relative_rates``), and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide frequencies. The entries of ``relative_rates`` are, in this order, :math:`(r_{AC}, r_{AG}, r_{AT}, r_{CG}, r_{CT}, r_{GT})`. If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where :math:`\\mathbf{Q} = \\frac{\\mu}{M} \\times \\begin{bmatrix} \\cdot & r_{AC} \\pi_C & r_{AG} \\pi_G & r_{AT} \\pi_T \\\\ r_{AC} \\pi_A & \\cdot & r_{CG} \\pi_G & r_{CT} \\pi_T \\\\ r_{AG} \\pi_A & r_{CG} \\pi_C & \\cdot & r_{GT} \\pi_T \\\\ r_{AT} \\pi_A & r_{CT} \\pi_C & r_{GT} \\pi_G & \\cdot \\end{bmatrix}` Here :math:`M` is an overall scaling factor on the mutation rate that can be computed as follows: let :math:`q_{ij} = r_{ij} \\pi_j` and :math:`q_{ii} = 0`, and then let :math:`M` be the largest row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, this implementation as a :class:`.MatrixMutationModel` has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii}` chosen so rows sum to one. If ``state_independent`` is true, then the above is modified by setting :math:`q_{ii} = \\pi_i`. This makes the model completely state-independent only if all :math:`r_{ij} = 1` by adding additional silent mutations. See :ref:`sec_mutations_existing` for more details. Citation: *Tavaré S (1986). "Some Probabilistic and Statistical Problems in the Analysis of DNA Sequences". Lectures on Mathematics in the Life Sciences. 17: 57–86.* Note that this implementation has the root distribution as a separate parameter, although it defaults to the equilibrium distribution. If you set the root distribution to be different than the equilibrium distribution, then you have a nonequilibrium model, and you should make sure that's what you want. :param list[float] relative_rates: Controls the relative rates of mutation between each pair of nucleotides, in order "A<>C", "A<>G", "A<>T", "C<>G", "C<>T", and "G<>T". :param list[float] equilibrium_frequencies: The equilibrium base frequencies, a list of four probabilities that sum to one. (Default: equal frequencies.) :param list[float] root_distribution: An array of float values of length 4 which should sum to 1. These values are used to determine the ancestral state of each mutational site. Defaults to the value of ``equilibrium_frequencies``. :param bool state_independent: Whether to include a higher rate of silent mutations that makes the model closer to state-independent. """ def __init__( self, relative_rates, equilibrium_frequencies=None, root_distribution=None, state_independent=False, ): alleles = _ACGT_ALLELES assert len(relative_rates) == 6 if equilibrium_frequencies is None: equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25] if root_distribution is None: root_distribution = equilibrium_frequencies transition_matrix = np.zeros((4, 4)) # relative_rates: [A->C, A->G, A->T, C->G, C->T, G->T] tri_upper = np.triu_indices_from(transition_matrix, k=1) transition_matrix[tri_upper] = relative_rates transition_matrix += transition_matrix.T if state_independent: np.fill_diagonal(transition_matrix, 1.0) transition_matrix *= equilibrium_frequencies row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal(transition_matrix, np.diag(transition_matrix) + 1.0 - row_sums) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class BLOSUM62(MatrixMutationModel): """ The BLOSUM62 model of time-reversible amino acid mutation. This model has no free parameters. The model is parameterised by a 20-by-20 symmetric matrix of relative rates, :math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi` (for the precise order, see this model's `.alleles` attribute). If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where :math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,` where :math:`M` is an overall scaling factor chosen so that the largest row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one. In this model, :math:`M = 1.726203705809619`. This implementation as a :class:`.MatrixMutationModel` has transition matrix :math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one, and root distribution equal to the equilibrium frequencies, :math:`\\pi`, so the matrix :math:`B` can be recovered as follows: .. code-block:: python model = msprime.BLOSUM62() M = 1.726203705809619 B = M * model.transition_matrix / model.root_distribution Citation: The values of :math:`B` and :math:`\\pi` used here were copied from `Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_. The original citation is: *Henikoff, S., and J. G. Henikoff. 1992. PNAS USA 89:10915-10919,* and the numerical values were provided in: *Yu,Y.-K., Wootton, J.C. and Altschul, S.F. (2003) The compositional adjustment of amino acid substitution matrices. Proc. Natl Acad. Sci., USA, 100, 15688–15693.* """ def __init__(self): alleles = _AMINO_ACIDS num_alleles = len(alleles) root_distribution = [ 0.074, 0.052, 0.045, 0.054, 0.025, 0.034, 0.054, 0.074, 0.026, 0.068, 0.099, 0.058, 0.025, 0.047, 0.039, 0.057, 0.051, 0.013, 0.032, 0.073, ] relative_rates = [ 0.735790389698, 0.485391055466, 1.297446705134, 0.543161820899, 0.500964408555, 3.180100048216, 1.459995310470, 0.227826574209, 0.397358949897, 0.240836614802, 1.199705704602, 3.020833610064, 1.839216146992, 1.190945703396, 0.329801504630, 1.170949042800, 1.360574190420, 1.240488508640, 3.761625208368, 0.140748891814, 5.528919177928, 1.955883574960, 0.418763308518, 1.355872344485, 0.798473248968, 0.418203192284, 0.609846305383, 0.423579992176, 0.716241444998, 1.456141166336, 2.414501434208, 0.778142664022, 0.354058109831, 2.435341131140, 1.626891056982, 0.539859124954, 0.605899003687, 0.232036445142, 0.283017326278, 0.418555732462, 0.774894022794, 0.236202451204, 0.186848046932, 0.189296292376, 0.252718447885, 0.800016530518, 0.622711669692, 0.211888159615, 0.218131577594, 0.831842640142, 0.580737093181, 0.372625175087, 0.217721159236, 0.348072209797, 3.890963773304, 1.295201266783, 5.411115141489, 1.593137043457, 1.032447924952, 0.285078800906, 3.945277674515, 2.802427151679, 0.752042440303, 1.022507035889, 0.406193586642, 0.445570274261, 1.253758266664, 0.983692987457, 0.648441278787, 0.222621897958, 0.767688823480, 2.494896077113, 0.555415397470, 0.459436173579, 0.984311525359, 3.364797763104, 6.030559379572, 1.073061184332, 0.492964679748, 0.371644693209, 0.354861249223, 0.281730694207, 0.441337471187, 0.144356959750, 0.291409084165, 0.368166464453, 0.714533703928, 1.517359325954, 2.064839703237, 0.266924750511, 1.773855168830, 1.173275900924, 0.448133661718, 0.494887043702, 0.730628272998, 0.356008498769, 0.858570575674, 0.926563934846, 0.504086599527, 0.527007339151, 0.388355409206, 0.374555687471, 1.047383450722, 0.454123625103, 0.233597909629, 4.325092687057, 1.122783104210, 2.904101656456, 1.582754142065, 1.197188415094, 1.934870924596, 1.769893238937, 1.509326253224, 1.117029762910, 0.357544412460, 0.352969184527, 1.752165917819, 0.918723415746, 0.540027644824, 1.169129577716, 1.729178019485, 0.914665954563, 1.898173634533, 0.934187509431, 1.119831358516, 1.277480294596, 1.071097236007, 0.641436011405, 0.585407090225, 1.179091197260, 0.915259857694, 1.303875200799, 1.488548053722, 0.488206118793, 1.005451683149, 5.151556292270, 0.465839367725, 0.426382310122, 0.191482046247, 0.145345046279, 0.527664418872, 0.758653808642, 0.407635648938, 0.508358924638, 0.301248600780, 0.341985787540, 0.691474634600, 0.332243040634, 0.888101098152, 2.074324893497, 0.252214830027, 0.387925622098, 0.513128126891, 0.718206697586, 0.720517441216, 0.538222519037, 0.261422208965, 0.470237733696, 0.958989742850, 0.596719300346, 0.308055737035, 4.218953969389, 0.674617093228, 0.811245856323, 0.717993486900, 0.951682162246, 6.747260430801, 0.369405319355, 0.796751520761, 0.801010243199, 4.054419006558, 2.187774522005, 0.438388343772, 0.312858797993, 0.258129289418, 1.116352478606, 0.530785790125, 0.524253846338, 0.253340790190, 0.201555971750, 8.311839405458, 2.231405688913, 0.498138475304, 2.575850755315, 0.838119610178, 0.496908410676, 0.561925457442, 2.253074051176, 0.266508731426, 1.000000000000, ] transition_matrix = np.zeros((num_alleles, num_alleles)) tril = np.tril_indices(num_alleles, k=-1) transition_matrix[tril] = relative_rates transition_matrix += np.tril(transition_matrix).T transition_matrix *= root_distribution row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal(transition_matrix, 1.0 - row_sums) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class PAM(MatrixMutationModel): """ The PAM model of time-reversible amino acid mutation. This model has no free parameters. The model is parameterised by a 20-by-20 symmetric matrix of relative rates, :math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi` (for the precise order, see this model's `.alleles` attribute). If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where :math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,` where :math:`M` is an overall scaling factor chosen so that the largest row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one. In this model, :math:`M = 1.78314862248`. This implementation as a :class:`.MatrixMutationModel` has transition matrix :math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one, and root distribution equal to the equilibrium frequencies, :math:`\\pi`, so the matrix :math:`B` can be recovered as follows: .. code-block:: python model = msprime.PAM() M = 1.78314862248 B = M * model.transition_matrix / model.root_distribution Citation: The values of :math:`B` and :math:`\\pi` used here were copied from `Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_, and follow the "Dayhoff DCMut" model as described in *Kosiol, C., and Goldman, N. 2005. Different versions of the Dayhoff rate matrix. Molecular Biology and Evolution 22:193-199.* The original citation is: *Dayhoff, M.O., Schwartz, R.M., Orcutt, B.C. (1978). A model of evolutionary change in proteins. Atlas of Protein Sequence Structures, Vol. 5, Suppl. 3, National Biomedical Research Foundation, Washington DC, pp. 345-352.* """ def __init__(self): alleles = _AMINO_ACIDS num_alleles = len(alleles) root_distribution = [ 0.087127, 0.040904, 0.040432, 0.046872, 0.033474, 0.038255, 0.049530, 0.088612, 0.033619, 0.036886, 0.085357, 0.080481, 0.014753, 0.039772, 0.050680, 0.069577, 0.058542, 0.010494, 0.029916, 0.064717, ] relative_rates = [ 0.267828, 0.984474, 0.327059, 1.199805, 0.000000, 8.931515, 0.360016, 0.232374, 0.000000, 0.000000, 0.887753, 2.439939, 1.028509, 1.348551, 0.000000, 1.961167, 0.000000, 1.493409, 11.388659, 0.000000, 7.086022, 2.386111, 0.087791, 1.385352, 1.240981, 0.107278, 0.281581, 0.811907, 0.228116, 2.383148, 5.290024, 0.868241, 0.282729, 6.011613, 0.439469, 0.106802, 0.653416, 0.632629, 0.768024, 0.239248, 0.438074, 0.180393, 0.609526, 0.000000, 0.076981, 0.406431, 0.154924, 0.341113, 0.000000, 0.000000, 0.730772, 0.112880, 0.071514, 0.443504, 2.556685, 0.258635, 4.610124, 3.148371, 0.716913, 0.000000, 1.519078, 0.830078, 0.267683, 0.270475, 0.460857, 0.180629, 0.717840, 0.896321, 0.000000, 0.000000, 0.000000, 1.127499, 0.304803, 0.170372, 0.000000, 3.332732, 5.230115, 2.411739, 0.183641, 0.136906, 0.138503, 0.000000, 0.000000, 0.000000, 0.000000, 0.153478, 0.475927, 1.951951, 1.565160, 0.000000, 0.921860, 2.485920, 1.028313, 0.419244, 0.133940, 0.187550, 1.526188, 0.507003, 0.347153, 0.933709, 0.119152, 0.316258, 0.335419, 0.170205, 0.110506, 4.051870, 1.531590, 4.885892, 0.956097, 1.598356, 0.561828, 0.793999, 2.322243, 0.353643, 0.247955, 0.171432, 0.954557, 0.619951, 0.459901, 2.427202, 3.680365, 0.265745, 2.271697, 0.660930, 0.162366, 0.525651, 0.340156, 0.306662, 0.226333, 1.900739, 0.331090, 1.350599, 1.031534, 0.136655, 0.782857, 5.436674, 0.000000, 2.001375, 0.224968, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.270564, 0.000000, 0.461776, 0.000000, 0.000000, 0.762354, 0.000000, 0.740819, 0.000000, 0.244139, 0.078012, 0.946940, 0.000000, 0.953164, 0.000000, 0.214717, 0.000000, 1.265400, 0.374834, 0.286572, 0.132142, 0.000000, 6.952629, 0.000000, 0.336289, 0.417839, 0.608070, 2.059564, 0.240368, 0.158067, 0.178316, 0.484678, 0.346983, 0.367250, 0.538165, 0.438715, 8.810038, 1.745156, 0.103850, 2.565955, 0.123606, 0.485026, 0.303836, 1.561997, 0.000000, 0.279379, ] transition_matrix = np.zeros((num_alleles, num_alleles)) tril = np.tril_indices(num_alleles, k=-1) transition_matrix[tril] = relative_rates transition_matrix += np.tril(transition_matrix).T transition_matrix *= root_distribution row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal(transition_matrix, 1.0 - row_sums) super().__init__(alleles, root_distribution, transition_matrix)
# Pre 1.0, we had these constants to define the alphabet for the # simple infinite sites mutations. These should be deprecated and removed # along with the InfiniteSites class. BINARY = 0 NUCLEOTIDES = 1
[docs]class InfiniteSites(MatrixMutationModel): # This mutation model is defined for backwards compatibility, and is a remnant # of an earlier design. def __init__(self, alphabet=BINARY): self.alphabet = alphabet models = {BINARY: BinaryMutationModel(), NUCLEOTIDES: JC69()} if alphabet not in models: raise ValueError("Bad alphabet") model = models[alphabet] super().__init__( model.alleles, model.root_distribution, model.transition_matrix )
def _simple_mutate( *, tables, sequence_length, rate, random_generator, ): """ Generate mutations directly using the low-level interfaces. Only for internal use with the ms and old-style simulate() interfaces. """ rate_map = intervals.RateMap.uniform(sequence_length, rate) _msprime.sim_mutations( tables, random_generator, rate_map.asdict(), model=BinaryMutationModel(), discrete_genome=False, )
[docs]def mutate( tree_sequence, rate=None, random_seed=None, model=None, keep=False, start_time=None, end_time=None, ): """ Simulates mutations on the specified ancestry and returns the resulting :class:`tskit.TreeSequence`. Mutations are generated at the specified rate in measured generations. Mutations are generated under the infinite sites model, and so the rate of new mutations is per unit of sequence length per generation. .. note:: This function is deprecated and we recommend new code use the :func:`sim_mutations` function instead, which is similar but offers more functionality and defaults to simulating mutations on a discrete genome. This function will be supported indefinitely, however. If a random seed is specified, this is used to seed the random number generator. If the same seed is specified and all other parameters are equal then the same mutations will be generated. If no random seed is specified then one is generated automatically. If the ``model`` parameter is specified, this determines the model under which mutations are generated. By default mutations from the infinite sites model with a binary alphabet are generated. By default, sites and mutations in the parameter tree sequence are discarded. If the ``keep`` parameter is true, however, *additional* mutations are simulated. Under the infinite sites mutation model, all new mutations generated will occur at distinct positions from each other and from any existing mutations (by rejection sampling). The time interval over which mutations can occur may be controlled using the ``start_time`` and ``end_time`` parameters. The ``start_time`` defines the lower bound (in time-ago) on this interval and ``max_time`` the upper bound. Note that we may have mutations associated with nodes with time <= ``start_time`` since mutations store the node at the bottom (i.e., towards the leaves) of the branch that they occur on. :param tskit.TreeSequence tree_sequence: The tree sequence onto which we wish to throw mutations. :param float rate: The rate of mutation per generation. (Default: 0). :param int random_seed: The random seed. If this is `None`, a random seed will be automatically generated. Valid random seeds must be between 1 and :math:`2^{32} - 1`. :param MutationModel model: The mutation model to use when generating mutations. If not specified or None, the :class:`.InfiniteSites` mutation model is used. :param bool keep: Whether to keep existing mutations (default: False). :param float start_time: The minimum time at which a mutation can occur. (Default: no restriction.) :param float end_time: The maximum time at which a mutation can occur (Default: no restriction). :return: The :class:`tskit.TreeSequence` object resulting from overlaying mutations on the input tree sequence. :rtype: :class:`tskit.TreeSequence` """ # It's probably an error if someone tries to use a new-style model # with the old mutate() function, and so we raise an error if anyone # does, to help them move to the new interface. if model is not None and not isinstance(model, InfiniteSites): raise ValueError( "This is a legacy interface which does not support general " "mutation models. Please use the sim_ancestry function instead." ) # The legacy function simulates 0/1 alleles by default. if model is None: model = BinaryMutationModel() return sim_mutations( tree_sequence, rate=rate, random_seed=random_seed, model=model, keep=keep, start_time=start_time, end_time=end_time, discrete_genome=False, )
# String names here should match Seq-Gen where possible MODEL_MAP = { "infinite_alleles": InfiniteAlleles, "binary": BinaryMutationModel, "jc69": JC69, "blosum62": BLOSUM62, "pam": PAM, # "slim": SLiMMutationModel(), Needs type argument so can't be string init'd # "hky": HKY(), Needs kappa argument # "f84": F84(), Needs kappa argument # "gtr": GTR(), Needs relative_rates argument } def mutation_model_factory(model): """ Returns a mutation model corresponding to the specified model. - If model is None, the default mutation model is returned. - If model is a string, return the corresponding model instance. - If model is an instance of MutationModel, return it. - Otherwise raise a type error. """ if model is None: model_instance = JC69() elif isinstance(model, str): lower_model = model.lower() if lower_model not in MODEL_MAP: raise ValueError( "Model '{}' unknown. Choose from {}".format( model, sorted(MODEL_MAP.keys()) ) ) model_instance = MODEL_MAP[lower_model]() elif isinstance(model, MutationModel): model_instance = model else: raise TypeError( "Mutation model must be a string or an instance of MutationModel" ) return model_instance
[docs]def sim_mutations( tree_sequence, rate=None, *, random_seed=None, model=None, start_time=None, end_time=None, discrete_genome=None, keep=None, ): """ Simulates mutations on the specified ancestry and returns the resulting :class:`tskit.TreeSequence`. Mutations are generated at the specified rate per unit of sequence length, per unit of time. By default, mutations are generated at discrete sites along the genome and multiple mutations can occur at any given site. A continuous sequence, infinite-sites model can also be specified by setting the ``discrete_genome`` parameter to False. If the ``model`` parameter is specified, this determines the model of sequence evolution under which mutations are generated. The default mutation model is the :class:`msprime.JC69`, a symmetrical mutation model among the ACGT alleles. See the :ref:`sec_mutations_models` section for details of available models. If a ``random_seed`` is specified, this is used to seed the random number generator. If the same seed is specified and all other parameters are equal then the same mutations will be generated. If no random seed is specified then one is generated automatically. The time interval over which mutations can occur may be controlled using the ``start_time`` and ``end_time`` parameters. The ``start_time`` defines the lower bound (in time-ago) on this interval and ``max_time`` the upper bound. Note that we may have mutations associated with nodes with time <= ``start_time`` since mutations store the node at the bottom (i.e., towards the leaves) of the branch that they occur on. If the tree sequence already has mutations, these are by default retained, but can be discarded by passing ``keep=False``. However, adding new mutations to a tree sequence with existing mutations must be done with caution, since it can lead to incorrect or nonsensical results if mutation probabilities differ by ancestral state. (As an extreme example, suppose that X->Y and X->Z are allowable transitions, but Y->Z is not. If a branch already has an X->Y mutation on it, then calling `sim_mutations(..., keep=True)` might insert an X->Z mutation above the existing mutation, thus implying the impossible chain X->Y->Z.) However, the effect on nucleotide models of mutation are generally very small. .. note:: Many mutation models will insert silent transitions (e.g., placing a mutation to A above an existing mutation to A). Such mutations are harmless and are required for us to guarantee the statistical properties of the process of sequentially adding mutations to a tree sequence. :param tskit.TreeSequence tree_sequence: The tree sequence we wish to throw mutations onto. :param float rate: The rate of mutation per unit of sequence length per unit time, as either a single number (for a uniform rate) or as a :class:`.RateMap`. (Default: 0). :param int random_seed: The random seed. If this is `None`, a random seed will be automatically generated. Valid random seeds must be between 1 and :math:`2^{32} - 1`. See the :ref:`sec_randomness_seeds` section for usage examples. :param MutationModel model: The mutation model to use when generating mutations. This can either be a string (e.g., ``"jc69"``) or an instance of a simulation model class e.g, ``msprime.F84(kappa=0.5)``. If not specified or None, the :class:`.JC69` mutation model is used. Please see the :ref:`sec_mutations_models` section for more details on specifying mutation models. :param float start_time: The minimum time ago at which a mutation can occur. (Default: no restriction.) :param float end_time: The maximum time ago at which a mutation can occur (Default: no restriction). :param bool discrete_genome: Whether to generate mutations at only integer positions along the genome (Default=True). :param bool keep: Whether to keep existing mutations. (default: True) :return: The :class:`tskit.TreeSequence` object resulting from overlaying mutations on the input tree sequence. :rtype: :class:`tskit.TreeSequence` """ try: tables = tree_sequence.tables except AttributeError: raise ValueError("First argument must be a TreeSequence instance.") seed = random_seed if random_seed is None: seed = core.get_random_seed() else: seed = int(seed) parameters = dict( command="sim_mutations", tree_sequence=tree_sequence, rate=rate, model=model, start_time=start_time, end_time=end_time, discrete_genome=discrete_genome, keep=keep, random_seed=seed, ) encoded_provenance = provenance.json_encode_provenance( provenance.get_provenance_dict(parameters) ) if rate is None: rate = 0 try: rate = float(rate) rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate) except TypeError: rate_map = rate if not isinstance(rate_map, intervals.RateMap): raise TypeError("rate must be a float or a RateMap") if tables.time_units == tskit.TIME_UNITS_UNCALIBRATED: raise ValueError( "Simulating mutations doesn't make sense when time is uncalibrated" ) start_time = -sys.float_info.max if start_time is None else float(start_time) end_time = sys.float_info.max if end_time is None else float(end_time) if start_time > end_time: raise ValueError("start_time must be <= end_time") discrete_genome = core._parse_flag(discrete_genome, default=True) keep = core._parse_flag(keep, default=True) model = mutation_model_factory(model) rng = _msprime.RandomGenerator(seed) lwt = _msprime.LightweightTableCollection() lwt.fromdict(tables.asdict()) _msprime.sim_mutations( tables=lwt, random_generator=rng, rate_map=rate_map.asdict(), model=model, discrete_genome=discrete_genome, keep=keep, start_time=start_time, end_time=end_time, ) tables = tskit.TableCollection.fromdict(lwt.asdict()) tables.provenances.add_row(encoded_provenance) return tables.tree_sequence()