Source code for msprime.mutations

#
# Copyright (C) 2018-2021 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime.  If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for generating mutations on a given tree sequence.
"""
import inspect
import sys

import numpy as np
import tskit

from . import core
from . import intervals
from . import provenance
from msprime import _msprime

_ACGT_ALLELES = ["A", "C", "G", "T"]
_AMINO_ACIDS = [
    "A",
    "R",
    "N",
    "D",
    "C",
    "Q",
    "E",
    "G",
    "H",
    "I",
    "L",
    "K",
    "M",
    "F",
    "P",
    "S",
    "T",
    "W",
    "Y",
    "V",
]


[docs]class MutationModel: """ Abstract superclass of msprime mutation models. """ def asdict(self): # This version of asdict makes sure that we have sufficient parameters # to call the constructor and recreate the class. However, this means # that subclasses *must* have an instance variable of the same name as # each parameter. # This is essential for Provenance round-tripping to work. return { key: getattr(self, key) for key in inspect.signature(self.__init__).parameters.keys() if hasattr(self, key) }
[docs]class MatrixMutationModel(_msprime.MatrixMutationModel, MutationModel): """ Superclass of the specific mutation models with a finite set of states. This class can be used to create a matrix mutation model with custom alleles and probabilities. For details of how this works see :ref:`sec_mutations_matrix_mutation_models_details`. :param list[str] alleles: A list of the possible alleles generated by the model. Each entry is a string. :param list[float] root_distribution: An array of float values the same length as ``alleles`` which should sum to 1. These values are used to determine the ancestral state of each mutated site. For example if ``alleles`` is ``['A','B']`` and ``root_distribution`` is ``[1,0]`` all ancestral states will be ``'A'``. :param list[float] transition_matrix: A square 2d-matrix of transition probabilities where both dimensions are equal to the length of the ``alleles`` argument. These are used to determine the derived state at each mutation. Note that rows should sum to 1. """ def __str__(self): s = f"Mutation model with alleles {self.alleles}\n" s += " root distribution: {}\n".format( " ".join(map(str, self.root_distribution)) ) s += " transition matrix:\n" for row in self.transition_matrix: s += " {}\n".format(" ".join(map(str, row))) return s
[docs]class SLiMMutationModel(_msprime.SLiMMutationModel, MutationModel): """ An infinite-alleles model of mutation producing "SLiM-style" mutations. To agree with mutations produced by SLiM, the ancestral state of each new site is set to the empty string, and each derived state is produced by appending the "next allele" to the previous state. The result is that each allele is a comma-separated string of all mutations that have occurred up to the root. Mutations produced by SLiM carry both a ``time`` attribute, in units of "time ago" as usual for tskit, as well as a metadata attributed called "origin_generation", in units of time since the start of the simulation. Adding these two together - time since the start of the simulation plus time until the end - is equal to the total number of generations of the simulation. The origin_generation is not currently used by SLiM, but for consistency, the origin_generation attribute for mutations produced by this model is set equal to ``slim_generation`` minus ``floor(mut.time)``, where ``mut.time`` is the (tskit) time ago of the mutation. :param int type: The nonnegative integer defining the "type" of SLiM mutation that will be recorded in metadata. :param int next_id: The nonnegative integer to start assigning alleles from. (default: 0) :param int slim_generation: The "SLiM generation" time used in determining the "origin_generation" metadata attribute of mutations. This can usually be left at its default, which is 1. :param int block_size: The block size for allocating derived states. You do not need to change this unless you get an "out of memory" error due to a very large number of stacked mutations. """ def __str__(self): return ( f"Mutation model for SLiM mutations of type m{self.type}\n" f" next ID: {self.next_id}\n" )
# NOTE: we use a hacky workaround here for documenting the next_allele instance # variable because of the nasty sphinx bug: # https://github.com/sphinx-doc/sphinx/issues/2549 # This seems to be basically impossible to workaround in JupyterBook, so it # seems best to just avoid using ivars. The correct approach going forward # would appear to be to use dataclasses and to document each variable # independently (e.g., Population class).
[docs]class InfiniteAlleles(_msprime.InfiniteAllelesMutationModel, MutationModel): """ An *infinite alleles* model of mutation in which each allele is a unique positive integer. This works by keeping track of a "next allele": each time the model is asked to produce a new allele (either for an ancestral or derived state), the "next allele" is provided, and then incremented. **Variables:** **next_allele**: The *next* allele to be assigned. This increments by one each time an allele is assigned to a new mutation, and resets to ``start_allele`` each time a new ancestral state is assigned. :param int start_allele: The nonnegative integer to start assigning alleles from. (default: 0) """ def __str__(self): return ( f"Infinite alleles mutation model, beginning with allele" f" {self.start_allele}\n next allele: {self.next_allele}\n" )
[docs]class BinaryMutationModel(MatrixMutationModel): """ Basic binary mutation model with two alleles: ``"0"`` and ``"1"``, and ancestral allele always ``"0"``. This is a :class:`.MatrixMutationModel` with alleles ``["0", "1"]``, root distribution ``[1.0, 0.0]``, and transition matrix that is either ``[[0.0, 1.0], [1.0, 0.0]]`` (by default), or ``[[0.5, 0.5], [0.5, 0.5]]`` (if ``state_independent=True``). :param bool state_independent: Whether mutations will be generated in a way independent of the previous allelic state, which includes silent mutations. See :ref:`sec_mutations_existing` for more details. """ def __init__(self, state_independent=False): alleles = ["0", "1"] root_distribution = [1, 0] if state_independent: transition_matrix = [[0.5, 0.5], [0.5, 0.5]] else: transition_matrix = [[0.0, 1.0], [1.0, 0.0]] super().__init__(alleles, root_distribution, transition_matrix)
[docs]class JC69(MatrixMutationModel): """ Jukes-Cantor mutation model (Jukes and Cantor 1969). Based on the standard ACGT nucleotides as allelic states, this model assumes equal probabilities for ancestral state and equal probabilities for all possible transitions. This is a :class:`.MatrixMutationModel` with alleles ``["A", "C", "G", "T"]``, root distribution ``[1/4, 1/4, 1/4, 1/4]``, and default transition matrix ``[[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3], [1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]]``. It has no free parameters. If ``state_independent`` is True, then instead all entries of the transition matrix will be 1/4, so one-quarter of the mutations will be silent. See :ref:`sec_mutations_existing` for more details. Citation: *Jukes TH, Cantor CR (1969). Evolution of Protein Molecules. pp. 21-132.* :param bool state_independent: Whether mutations will be generated in a way independent of the previous allelic state. """ def __init__(self, state_independent=False): alleles = _ACGT_ALLELES root_distribution = [0.25, 0.25, 0.25, 0.25] transition_matrix = np.zeros((4, 4)) if state_independent: transition_matrix[:] = 1 / 4 else: transition_matrix[:] = 1 / 3 np.fill_diagonal(transition_matrix, 0) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class HKY(MatrixMutationModel): """ The Hasegawa, Kishino and Yano mutation model (Hasegawa et al. 1985). Based on the standard ACGT nucleotides as allelic states, this model allows different rates for transitions and transversions, and sets an equilibrium frequency for each nucleotide. In addition a custom ancestral frequency (``root_distribution``) can be specified. With ``kappa=1.0`` and the default values of the other arguments this model is equal to :class:`.JC69`. This model is similar to :class:`.F84` but with a differing parameterisation for ``kappa``. This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of transition to transversion mutation rates, and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide frequencies. If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is equal to :math:`Q_{ij}`, where :math:`\\mathbf{Q} = \\frac{\\mu}{M} \\times \\begin{bmatrix} \\cdot & \\pi_C & \\kappa \\pi_G & \\pi_T \\\\ \\pi_A & \\cdot & \\pi_G & \\kappa \\pi_T \\\\ \\kappa \\pi_A & \\pi_C & \\cdot & \\pi_T \\\\ \\pi_A & \\kappa \\pi_C & \\pi_G & \\cdot \\end{bmatrix}` Here :math:`M` is an overall scaling factor on the mutation rate that can be computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion, and :math:`q_{ij} = \\kappa \\pi_j` otherwise. Set :math:`q_{ii} = 0`, and then let :math:`M` be the largest row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, this implementation as a :class:`.MatrixMutationModel` has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`. Note also that :math:`\\kappa` is the ratio of *individual* mutation rates, not the ratio of *total* transition to transversion rates, which would be :math:`\\kappa/2`, as is used in some parameterisations. If ``state_independent`` is true, then the above is modified by setting :math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent only if :math:`\\kappa = 1` by adding additional silent mutations. See :ref:`sec_mutations_existing` for more details. Citation: *Hasegawa M, Kishino H, Yano T (1985). "Dating of the human-ape splitting by a molecular clock of mitochondrial DNA". Journal of Molecular Evolution. 22 (2): 160–74.* Note that this implementation has the root distribution as a separate parameter, although it defaults to the equilibrium distribution. If you set the root distribution to be different than the equilibrium distribution, then you have a nonequilibrium model, and you should make sure that's what you want. :param float kappa: Scaling parameter by which the transition matrix is modified for transitions (A<>G, C<>T). For example a value of ``0`` means that no transitions will occur, ``1`` makes the probability of transitions and transversions equal. :param list[float] equilibrium_frequencies: An array of float values of length 4 which should sum to 1. These values are used to determine equilibrium frequencies (long-term averages) of the alleles. Defaults to [0.25, 0.25, 0.25, 0.25], i.e., all nucleotides equally likely. :param list[float] root_distribution: An array of float values of length 4 which should sum to 1. These values are used to determine the ancestral state of each mutational site. Defaults to the value of ``equilibrium_frequencies``. :param bool state_independent: Whether to include a higher rate of silent mutations that makes the model closer to state-independent. """ def __init__( self, kappa, equilibrium_frequencies=None, root_distribution=None, state_independent=False, ): alleles = _ACGT_ALLELES if equilibrium_frequencies is None: equilibrium_frequencies = np.array( [0.25, 0.25, 0.25, 0.25], dtype="float64" ) else: equilibrium_frequencies = np.array(equilibrium_frequencies, dtype="float64") if root_distribution is None: root_distribution = equilibrium_frequencies.copy() transition_matrix = np.full((4, 4), 1.0) # positions in transition matrix for transitions transition_pos = ((0, 1, 2, 3), (2, 3, 0, 1)) transition_matrix[transition_pos] = kappa transition_matrix *= equilibrium_frequencies if state_independent: np.fill_diagonal(transition_matrix, kappa * equilibrium_frequencies) else: np.fill_diagonal(transition_matrix, 0.0) row_sums = transition_matrix.sum(axis=1, dtype="float64") transition_matrix /= max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal( transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix)) ) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class F84(MatrixMutationModel): """ The F84 mutation model (Felsenstein and Churchill, 1996). Based on the standard ACGT nucleotides as allelic states, this model takes into account transitions and transversions, and sets an equilibrium frequency for each nucleotide. In addition a custom ancestral frequency (``root_distribution``) can be specified. With ``kappa=1.0`` and the default values of the other arguments this model is equal to :class:`.JC69`. This model is similar to :class:`.HKY` but with a differing parameterisation for ``kappa``. This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of transition to transversion mutation rates, and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide frequencies. If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is equal to :math:`Q_{ij}`, where :math:`\\mathbf{Q} = \\frac{\\mu}{M} \\times \\begin{bmatrix} \\cdot & \\pi_C & \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_G & \\pi_T \\\\ \\pi_A & \\cdot & \\pi_G & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_T \\\\ \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_A & \\pi_C & \\cdot & \\pi_T \\\\ \\pi_A & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_C & \\pi_G & \\cdot \\end{bmatrix}` Here :math:`M` is an overall scaling factor on the mutation rate that can be computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion, :math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_A + \\pi_G)) \\pi_j` if i and j are A and G in some order, :math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_C + \\pi_T)) \\pi_j` if i and j are C and T in some order, and :math:`q_{ii} = 0`. Let :math:`M` be the largest row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, this implementation as a :class:`.MatrixMutationModel` has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`. If ``state_independent`` is true, then the above is modified by setting :math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent only if :math:`\\kappa = 1` by adding additional silent mutations. See :ref:`sec_mutations_existing` for more details. Citation: *Felsenstein J, Churchill GA (January 1996). "A Hidden Markov Model approach to variation among sites in rate of evolution". Molecular Biology and Evolution. 13 (1): 93–104.* Note that this implementation has the root distribution as a separate parameter, although it defaults to the equilibrium distribution. If you set the root distribution to be different than the equilibrium distribution, then you have a nonequilibrium model, and you should make sure that's what you want. :param float kappa: Scaling parameter by which the transition matrix is modified for transitions (A<>G, C<>T). Must be greater than or equal to the larger of :math:`(\\pi_A+\\pi_G)/(\\pi_C+\\pi_T)` and :math:`(\\pi_C+\\pi_T)/(\\pi_A+\\pi_G)`. :param list[float] equilibrium_frequencies: An array of float values of length 4 which should sum to 1. These values are used to determine equilibrium frequencies (long-term averages) of the alleles. Defaults to [0.25, 0.25, 0.25, 0.25], i.e all nucleotides equally likely. :param list[float] root_distribution: An array of float values of length 4 which should sum to 1. These values are used to determine the ancestral state of each mutational site. Defaults to the value of ``equilibrium_frequencies``. :param bool state_independent: Whether to include a higher rate of silent mutations that makes the model closer to state-independent. """ def __init__( self, kappa, equilibrium_frequencies=None, root_distribution=None, state_independent=False, ): alleles = _ACGT_ALLELES if equilibrium_frequencies is None: equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25] if root_distribution is None: root_distribution = equilibrium_frequencies.copy() transition_matrix = np.full((4, 4), 1.0, dtype="float64") # positions in transition matrix for transitions transition_pos_AG = ((0, 2), (2, 0)) transition_pos_CT = ((1, 3), (3, 1)) p_AG = equilibrium_frequencies[0] + equilibrium_frequencies[2] p_CT = equilibrium_frequencies[1] + equilibrium_frequencies[3] gamma = np.float64(1 + (kappa - 1) / np.array([p_AG, p_CT])) transition_matrix[transition_pos_AG] = gamma[0] transition_matrix[transition_pos_CT] = gamma[1] transition_matrix *= equilibrium_frequencies if state_independent: np.fill_diagonal( transition_matrix, equilibrium_frequencies * np.concatenate([gamma, gamma]), ) else: np.fill_diagonal(transition_matrix, 0.0) row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal( transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix)) ) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class GTR(MatrixMutationModel): """ The Generalised Time-Reversible nucleotide mutation model, a general parameterisation of a time-reversible mutation process (Tavaré et al. 1986). It allows specification of per-nucleotide equilibrium frequencies and equilibrium transition rates. This model is parameterised by the vector :math:`r` (``relative_rates``), and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide frequencies. The entries of ``relative_rates`` are, in this order, :math:`(r_{AC}, r_{AG}, r_{AT}, r_{CG}, r_{CT}, r_{GT})`. If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where :math:`\\mathbf{Q} = \\frac{\\mu}{M} \\times \\begin{bmatrix} \\cdot & r_{AC} \\pi_C & r_{AG} \\pi_G & r_{AT} \\pi_T \\\\ r_{AC} \\pi_A & \\cdot & r_{CG} \\pi_G & r_{CT} \\pi_T \\\\ r_{AG} \\pi_A & r_{CG} \\pi_C & \\cdot & r_{GT} \\pi_T \\\\ r_{AT} \\pi_A & r_{CT} \\pi_C & r_{GT} \\pi_G & \\cdot \\end{bmatrix}` Here :math:`M` is an overall scaling factor on the mutation rate that can be computed as follows: let :math:`q_{ij} = r_{ij} \\pi_j` and :math:`q_{ii} = 0`, and then let :math:`M` be the largest row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`. Then, this implementation as a :class:`.MatrixMutationModel` has the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii}` chosen so rows sum to one. If ``state_independent`` is true, then the above is modified by setting :math:`q_{ii} = \\pi_i`. This makes the model completely state-independent only if all :math:`r_{ij} = 1` by adding additional silent mutations. See :ref:`sec_mutations_existing` for more details. Citation: *Tavaré S (1986). "Some Probabilistic and Statistical Problems in the Analysis of DNA Sequences". Lectures on Mathematics in the Life Sciences. 17: 57–86.* Note that this implementation has the root distribution as a separate parameter, although it defaults to the equilibrium distribution. If you set the root distribution to be different than the equilibrium distribution, then you have a nonequilibrium model, and you should make sure that's what you want. :param list[float] relative_rates: Controls the relative rates of mutation between each pair of nucleotides, in order "A<>C", "A<>G", "A<>T", "C<>G", "C<>T", and "G<>T". :param list[float] equilibrium_frequencies: The equilibrium base frequencies, a list of four probabilities that sum to one. (Default: equal frequencies.) :param list[float] root_distribution: An array of float values of length 4 which should sum to 1. These values are used to determine the ancestral state of each mutational site. Defaults to the value of ``equilibrium_frequencies``. :param bool state_independent: Whether to include a higher rate of silent mutations that makes the model closer to state-independent. """ def __init__( self, relative_rates, equilibrium_frequencies=None, root_distribution=None, state_independent=False, ): alleles = _ACGT_ALLELES assert len(relative_rates) == 6 if equilibrium_frequencies is None: equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25] if root_distribution is None: root_distribution = equilibrium_frequencies transition_matrix = np.zeros((4, 4)) # relative_rates: [A->C, A->G, A->T, C->G, C->T, G->T] tri_upper = np.triu_indices_from(transition_matrix, k=1) transition_matrix[tri_upper] = relative_rates transition_matrix += transition_matrix.T if state_independent: np.fill_diagonal(transition_matrix, 1.0) transition_matrix *= equilibrium_frequencies row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal(transition_matrix, np.diag(transition_matrix) + 1.0 - row_sums) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class BLOSUM62(MatrixMutationModel): """ The BLOSUM62 model of time-reversible amino acid mutation. This model has no free parameters. The model is parameterised by a 20-by-20 symmetric matrix of relative rates, :math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi` (for the precise order, see this model's `.alleles` attribute). If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where :math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,` where :math:`M` is an overall scaling factor chosen so that the largest row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one. In this model, :math:`M = 1.726203705809619`. This implementation as a :class:`.MatrixMutationModel` has transition matrix :math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one, and root distribution equal to the equilibrium frequencies, :math:`\\pi`, so the matrix :math:`B` can be recovered as follows: .. code-block:: python model = msprime.BLOSUM62() M = 1.726203705809619 B = M * model.transition_matrix / model.root_distribution Citation: The values of :math:`B` and :math:`\\pi` used here were copied from `Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_. The original citation is: *Henikoff, S., and J. G. Henikoff. 1992. PNAS USA 89:10915-10919,* and the numerical values were provided in: *Yu,Y.-K., Wootton, J.C. and Altschul, S.F. (2003) The compositional adjustment of amino acid substitution matrices. Proc. Natl Acad. Sci., USA, 100, 15688–15693.* """ def __init__(self): alleles = _AMINO_ACIDS num_alleles = len(alleles) root_distribution = [ 0.074, 0.052, 0.045, 0.054, 0.025, 0.034, 0.054, 0.074, 0.026, 0.068, 0.099, 0.058, 0.025, 0.047, 0.039, 0.057, 0.051, 0.013, 0.032, 0.073, ] relative_rates = [ 0.735790389698, 0.485391055466, 1.297446705134, 0.543161820899, 0.500964408555, 3.180100048216, 1.459995310470, 0.227826574209, 0.397358949897, 0.240836614802, 1.199705704602, 3.020833610064, 1.839216146992, 1.190945703396, 0.329801504630, 1.170949042800, 1.360574190420, 1.240488508640, 3.761625208368, 0.140748891814, 5.528919177928, 1.955883574960, 0.418763308518, 1.355872344485, 0.798473248968, 0.418203192284, 0.609846305383, 0.423579992176, 0.716241444998, 1.456141166336, 2.414501434208, 0.778142664022, 0.354058109831, 2.435341131140, 1.626891056982, 0.539859124954, 0.605899003687, 0.232036445142, 0.283017326278, 0.418555732462, 0.774894022794, 0.236202451204, 0.186848046932, 0.189296292376, 0.252718447885, 0.800016530518, 0.622711669692, 0.211888159615, 0.218131577594, 0.831842640142, 0.580737093181, 0.372625175087, 0.217721159236, 0.348072209797, 3.890963773304, 1.295201266783, 5.411115141489, 1.593137043457, 1.032447924952, 0.285078800906, 3.945277674515, 2.802427151679, 0.752042440303, 1.022507035889, 0.406193586642, 0.445570274261, 1.253758266664, 0.983692987457, 0.648441278787, 0.222621897958, 0.767688823480, 2.494896077113, 0.555415397470, 0.459436173579, 0.984311525359, 3.364797763104, 6.030559379572, 1.073061184332, 0.492964679748, 0.371644693209, 0.354861249223, 0.281730694207, 0.441337471187, 0.144356959750, 0.291409084165, 0.368166464453, 0.714533703928, 1.517359325954, 2.064839703237, 0.266924750511, 1.773855168830, 1.173275900924, 0.448133661718, 0.494887043702, 0.730628272998, 0.356008498769, 0.858570575674, 0.926563934846, 0.504086599527, 0.527007339151, 0.388355409206, 0.374555687471, 1.047383450722, 0.454123625103, 0.233597909629, 4.325092687057, 1.122783104210, 2.904101656456, 1.582754142065, 1.197188415094, 1.934870924596, 1.769893238937, 1.509326253224, 1.117029762910, 0.357544412460, 0.352969184527, 1.752165917819, 0.918723415746, 0.540027644824, 1.169129577716, 1.729178019485, 0.914665954563, 1.898173634533, 0.934187509431, 1.119831358516, 1.277480294596, 1.071097236007, 0.641436011405, 0.585407090225, 1.179091197260, 0.915259857694, 1.303875200799, 1.488548053722, 0.488206118793, 1.005451683149, 5.151556292270, 0.465839367725, 0.426382310122, 0.191482046247, 0.145345046279, 0.527664418872, 0.758653808642, 0.407635648938, 0.508358924638, 0.301248600780, 0.341985787540, 0.691474634600, 0.332243040634, 0.888101098152, 2.074324893497, 0.252214830027, 0.387925622098, 0.513128126891, 0.718206697586, 0.720517441216, 0.538222519037, 0.261422208965, 0.470237733696, 0.958989742850, 0.596719300346, 0.308055737035, 4.218953969389, 0.674617093228, 0.811245856323, 0.717993486900, 0.951682162246, 6.747260430801, 0.369405319355, 0.796751520761, 0.801010243199, 4.054419006558, 2.187774522005, 0.438388343772, 0.312858797993, 0.258129289418, 1.116352478606, 0.530785790125, 0.524253846338, 0.253340790190, 0.201555971750, 8.311839405458, 2.231405688913, 0.498138475304, 2.575850755315, 0.838119610178, 0.496908410676, 0.561925457442, 2.253074051176, 0.266508731426, 1.000000000000, ] transition_matrix = np.zeros((num_alleles, num_alleles)) tril = np.tril_indices(num_alleles, k=-1) transition_matrix[tril] = relative_rates transition_matrix += np.tril(transition_matrix).T transition_matrix *= root_distribution row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal(transition_matrix, 1.0 - row_sums) super().__init__(alleles, root_distribution, transition_matrix)
[docs]class PAM(MatrixMutationModel): """ The PAM model of time-reversible amino acid mutation. This model has no free parameters. The model is parameterised by a 20-by-20 symmetric matrix of relative rates, :math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi` (for the precise order, see this model's `.alleles` attribute). If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where :math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,` where :math:`M` is an overall scaling factor chosen so that the largest row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one. In this model, :math:`M = 1.78314862248`. This implementation as a :class:`.MatrixMutationModel` has transition matrix :math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one, and root distribution equal to the equilibrium frequencies, :math:`\\pi`, so the matrix :math:`B` can be recovered as follows: .. code-block:: python model = msprime.PAM() M = 1.78314862248 B = M * model.transition_matrix / model.root_distribution Citation: The values of :math:`B` and :math:`\\pi` used here were copied from `Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_, and follow the "Dayhoff DCMut" model as described in *Kosiol, C., and Goldman, N. 2005. Different versions of the Dayhoff rate matrix. Molecular Biology and Evolution 22:193-199.* The original citation is: *Dayhoff, M.O., Schwartz, R.M., Orcutt, B.C. (1978). A model of evolutionary change in proteins. Atlas of Protein Sequence Structures, Vol. 5, Suppl. 3, National Biomedical Research Foundation, Washington DC, pp. 345-352.* """ def __init__(self): alleles = _AMINO_ACIDS num_alleles = len(alleles) root_distribution = [ 0.087127, 0.040904, 0.040432, 0.046872, 0.033474, 0.038255, 0.049530, 0.088612, 0.033619, 0.036886, 0.085357, 0.080481, 0.014753, 0.039772, 0.050680, 0.069577, 0.058542, 0.010494, 0.029916, 0.064717, ] relative_rates = [ 0.267828, 0.984474, 0.327059, 1.199805, 0.000000, 8.931515, 0.360016, 0.232374, 0.000000, 0.000000, 0.887753, 2.439939, 1.028509, 1.348551, 0.000000, 1.961167, 0.000000, 1.493409, 11.388659, 0.000000, 7.086022, 2.386111, 0.087791, 1.385352, 1.240981, 0.107278, 0.281581, 0.811907, 0.228116, 2.383148, 5.290024, 0.868241, 0.282729, 6.011613, 0.439469, 0.106802, 0.653416, 0.632629, 0.768024, 0.239248, 0.438074, 0.180393, 0.609526, 0.000000, 0.076981, 0.406431, 0.154924, 0.341113, 0.000000, 0.000000, 0.730772, 0.112880, 0.071514, 0.443504, 2.556685, 0.258635, 4.610124, 3.148371, 0.716913, 0.000000, 1.519078, 0.830078, 0.267683, 0.270475, 0.460857, 0.180629, 0.717840, 0.896321, 0.000000, 0.000000, 0.000000, 1.127499, 0.304803, 0.170372, 0.000000, 3.332732, 5.230115, 2.411739, 0.183641, 0.136906, 0.138503, 0.000000, 0.000000, 0.000000, 0.000000, 0.153478, 0.475927, 1.951951, 1.565160, 0.000000, 0.921860, 2.485920, 1.028313, 0.419244, 0.133940, 0.187550, 1.526188, 0.507003, 0.347153, 0.933709, 0.119152, 0.316258, 0.335419, 0.170205, 0.110506, 4.051870, 1.531590, 4.885892, 0.956097, 1.598356, 0.561828, 0.793999, 2.322243, 0.353643, 0.247955, 0.171432, 0.954557, 0.619951, 0.459901, 2.427202, 3.680365, 0.265745, 2.271697, 0.660930, 0.162366, 0.525651, 0.340156, 0.306662, 0.226333, 1.900739, 0.331090, 1.350599, 1.031534, 0.136655, 0.782857, 5.436674, 0.000000, 2.001375, 0.224968, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.270564, 0.000000, 0.461776, 0.000000, 0.000000, 0.762354, 0.000000, 0.740819, 0.000000, 0.244139, 0.078012, 0.946940, 0.000000, 0.953164, 0.000000, 0.214717, 0.000000, 1.265400, 0.374834, 0.286572, 0.132142, 0.000000, 6.952629, 0.000000, 0.336289, 0.417839, 0.608070, 2.059564, 0.240368, 0.158067, 0.178316, 0.484678, 0.346983, 0.367250, 0.538165, 0.438715, 8.810038, 1.745156, 0.103850, 2.565955, 0.123606, 0.485026, 0.303836, 1.561997, 0.000000, 0.279379, ] transition_matrix = np.zeros((num_alleles, num_alleles)) tril = np.tril_indices(num_alleles, k=-1) transition_matrix[tril] = relative_rates transition_matrix += np.tril(transition_matrix).T transition_matrix *= root_distribution row_sums = transition_matrix.sum(axis=1) transition_matrix = transition_matrix / max(row_sums) row_sums = transition_matrix.sum(axis=1, dtype="float64") np.fill_diagonal(transition_matrix, 1.0 - row_sums) super().__init__(alleles, root_distribution, transition_matrix)
# Pre 1.0, we had these constants to define the alphabet for the # simple infinite sites mutations. These should be deprecated and removed # along with the InfiniteSites class. BINARY = 0 NUCLEOTIDES = 1
[docs]class InfiniteSites(MatrixMutationModel): # This mutation model is defined for backwards compatibility, and is a remnant # of an earlier design. def __init__(self, alphabet=BINARY): self.alphabet = alphabet models = {BINARY: BinaryMutationModel(), NUCLEOTIDES: JC69()} if alphabet not in models: raise ValueError("Bad alphabet") model = models[alphabet] super().__init__( model.alleles, model.root_distribution, model.transition_matrix )
def _simple_mutate( *, tables, sequence_length, rate, random_generator, ): """ Generate mutations directly using the low-level interfaces. Only for internal use with the ms and old-style simulate() interfaces. """ rate_map = intervals.RateMap.uniform(sequence_length, rate) _msprime.sim_mutations( tables, random_generator, rate_map.asdict(), model=BinaryMutationModel(), discrete_genome=False, )
[docs]def mutate( tree_sequence, rate=None, random_seed=None, model=None, keep=False, start_time=None, end_time=None, ): """ Simulates mutations on the specified ancestry and returns the resulting :class:`tskit.TreeSequence`. Mutations are generated at the specified rate in measured generations. Mutations are generated under the infinite sites model, and so the rate of new mutations is per unit of sequence length per generation. .. note:: This function is deprecated and we recommend new code use the :func:`sim_mutations` function instead, which is similar but offers more functionality and defaults to simulating mutations on a discrete genome. This function will be supported indefinitely, however. If a random seed is specified, this is used to seed the random number generator. If the same seed is specified and all other parameters are equal then the same mutations will be generated. If no random seed is specified then one is generated automatically. If the ``model`` parameter is specified, this determines the model under which mutations are generated. By default mutations from the infinite sites model with a binary alphabet are generated. By default, sites and mutations in the parameter tree sequence are discarded. If the ``keep`` parameter is true, however, *additional* mutations are simulated. Under the infinite sites mutation model, all new mutations generated will occur at distinct positions from each other and from any existing mutations (by rejection sampling). The time interval over which mutations can occur may be controlled using the ``start_time`` and ``end_time`` parameters. The ``start_time`` defines the lower bound (in time-ago) on this interval and ``max_time`` the upper bound. Note that we may have mutations associated with nodes with time <= ``start_time`` since mutations store the node at the bottom (i.e., towards the leaves) of the branch that they occur on. :param tskit.TreeSequence tree_sequence: The tree sequence onto which we wish to throw mutations. :param float rate: The rate of mutation per generation. (Default: 0). :param int random_seed: The random seed. If this is `None`, a random seed will be automatically generated. Valid random seeds must be between 1 and :math:`2^{32} - 1`. :param MutationModel model: The mutation model to use when generating mutations. If not specified or None, the :class:`.InfiniteSites` mutation model is used. :param bool keep: Whether to keep existing mutations (default: False). :param float start_time: The minimum time at which a mutation can occur. (Default: no restriction.) :param float end_time: The maximum time at which a mutation can occur (Default: no restriction). :return: The :class:`tskit.TreeSequence` object resulting from overlaying mutations on the input tree sequence. :rtype: :class:`tskit.TreeSequence` """ # It's probably an error if someone tries to use a new-style model # with the old mutate() function, and so we raise an error if anyone # does, to help them move to the new interface. if model is not None and not isinstance(model, InfiniteSites): raise ValueError( "This is a legacy interface which does not support general " "mutation models. Please use the sim_ancestry function instead." ) # The legacy function simulates 0/1 alleles by default. if model is None: model = BinaryMutationModel() return sim_mutations( tree_sequence, rate=rate, random_seed=random_seed, model=model, keep=keep, start_time=start_time, end_time=end_time, discrete_genome=False, )
# String names here should match Seq-Gen where possible MODEL_MAP = { "infinite_alleles": InfiniteAlleles, "binary": BinaryMutationModel, "jc69": JC69, "blosum62": BLOSUM62, "pam": PAM, # "slim": SLiMMutationModel(), Needs type argument so can't be string init'd # "hky": HKY(), Needs kappa argument # "f84": F84(), Needs kappa argument # "gtr": GTR(), Needs relative_rates argument } def mutation_model_factory(model): """ Returns a mutation model corresponding to the specified model. - If model is None, the default mutation model is returned. - If model is a string, return the corresponding model instance. - If model is an instance of MutationModel, return it. - Otherwise raise a type error. """ if model is None: model_instance = JC69() elif isinstance(model, str): lower_model = model.lower() if lower_model not in MODEL_MAP: raise ValueError( "Model '{}' unknown. Choose from {}".format( model, sorted(MODEL_MAP.keys()) ) ) model_instance = MODEL_MAP[lower_model]() elif isinstance(model, MutationModel): model_instance = model else: raise TypeError( "Mutation model must be a string or an instance of MutationModel" ) return model_instance
[docs]def sim_mutations( tree_sequence, rate=None, *, random_seed=None, model=None, start_time=None, end_time=None, discrete_genome=None, keep=None, ): """ Simulates mutations on the specified ancestry and returns the resulting :class:`tskit.TreeSequence`. Mutations are generated at the specified rate per unit of sequence length, per unit of time. By default, mutations are generated at discrete sites along the genome and multiple mutations can occur at any given site. A continuous sequence, infinite-sites model can also be specified by setting the ``discrete_genome`` parameter to False. If the ``model`` parameter is specified, this determines the model of sequence evolution under which mutations are generated. The default mutation model is the :class:`msprime.JC69`, a symmetrical mutation model among the ACGT alleles. See the :ref:`sec_mutations_models` section for details of available models. If a ``random_seed`` is specified, this is used to seed the random number generator. If the same seed is specified and all other parameters are equal then the same mutations will be generated. If no random seed is specified then one is generated automatically. The time interval over which mutations can occur may be controlled using the ``start_time`` and ``end_time`` parameters. The ``start_time`` defines the lower bound (in time-ago) on this interval and ``max_time`` the upper bound. Note that we may have mutations associated with nodes with time <= ``start_time`` since mutations store the node at the bottom (i.e., towards the leaves) of the branch that they occur on. If the tree sequence already has mutations, these are by default retained, but can be discarded by passing ``keep=False``. However, adding new mutations to a tree sequence with existing mutations must be done with caution, since it can lead to incorrect or nonsensical results if mutation probabilities differ by ancestral state. (As an extreme example, suppose that X->Y and X->Z are allowable transitions, but Y->Z is not. If a branch already has an X->Y mutation on it, then calling `sim_mutations(..., keep=True)` might insert an X->Z mutation above the existing mutation, thus implying the impossible chain X->Y->Z.) However, the effect on nucleotide models of mutation are generally very small. .. note:: Many mutation models will insert silent transitions (e.g., placing a mutation to A above an existing mutation to A). Such mutations are harmless and are required for us to guarantee the statistical properties of the process of sequentially adding mutations to a tree sequence. :param tskit.TreeSequence tree_sequence: The tree sequence we wish to throw mutations onto. :param float rate: The rate of mutation per unit of sequence length per unit time, as either a single number (for a uniform rate) or as a :class:`.RateMap`. (Default: 0). :param int random_seed: The random seed. If this is `None`, a random seed will be automatically generated. Valid random seeds must be between 1 and :math:`2^{32} - 1`. See the :ref:`sec_randomness_seeds` section for usage examples. :param MutationModel model: The mutation model to use when generating mutations. This can either be a string (e.g., ``"jc69"``) or an instance of a simulation model class e.g, ``msprime.F84(kappa=0.5)``. If not specified or None, the :class:`.JC69` mutation model is used. Please see the :ref:`sec_mutations_models` section for more details on specifying mutation models. :param float start_time: The minimum time ago at which a mutation can occur. (Default: no restriction.) :param float end_time: The maximum time ago at which a mutation can occur (Default: no restriction). :param bool discrete_genome: Whether to generate mutations at only integer positions along the genome (Default=True). :param bool keep: Whether to keep existing mutations. (default: True) :return: The :class:`tskit.TreeSequence` object resulting from overlaying mutations on the input tree sequence. :rtype: :class:`tskit.TreeSequence` """ try: tables = tree_sequence.tables except AttributeError: raise ValueError("First argument must be a TreeSequence instance.") seed = random_seed if random_seed is None: seed = core.get_random_seed() else: seed = int(seed) if rate is None: rate = 0 try: rate = float(rate) rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate) except TypeError: rate_map = rate if not isinstance(rate_map, intervals.RateMap): raise TypeError("rate must be a float or a RateMap") start_time = -sys.float_info.max if start_time is None else float(start_time) end_time = sys.float_info.max if end_time is None else float(end_time) if start_time > end_time: raise ValueError("start_time must be <= end_time") discrete_genome = core._parse_flag(discrete_genome, default=True) keep = core._parse_flag(keep, default=True) model = mutation_model_factory(model) argspec = inspect.getargvalues(inspect.currentframe()) parameters = { "command": "sim_mutations", **{arg: argspec.locals[arg] for arg in argspec.args}, } parameters["random_seed"] = seed encoded_provenance = provenance.json_encode_provenance( provenance.get_provenance_dict(parameters) ) rng = _msprime.RandomGenerator(seed) lwt = _msprime.LightweightTableCollection() lwt.fromdict(tables.asdict()) _msprime.sim_mutations( tables=lwt, random_generator=rng, rate_map=rate_map.asdict(), model=model, discrete_genome=discrete_genome, keep=keep, start_time=start_time, end_time=end_time, ) tables = tskit.TableCollection.fromdict(lwt.asdict()) tables.provenances.add_row(encoded_provenance) return tables.tree_sequence()