# Source code for msprime.mutations

```
#
# Copyright (C) 2018-2021 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime. If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for generating mutations on a given tree sequence.
"""
import inspect
import sys
import numpy as np
import tskit
from . import core
from . import intervals
from . import provenance
from msprime import _msprime
_ACGT_ALLELES = ["A", "C", "G", "T"]
_AMINO_ACIDS = [
"A",
"R",
"N",
"D",
"C",
"Q",
"E",
"G",
"H",
"I",
"L",
"K",
"M",
"F",
"P",
"S",
"T",
"W",
"Y",
"V",
]
[docs]class MutationModel:
"""
Abstract superclass of msprime mutation models.
"""
def asdict(self):
# This version of asdict makes sure that we have sufficient parameters
# to call the constructor and recreate the class. However, this means
# that subclasses *must* have an instance variable of the same name as
# each parameter.
# This is essential for Provenance round-tripping to work.
return {
key: getattr(self, key)
for key in inspect.signature(self.__init__).parameters.keys()
if hasattr(self, key)
}
[docs]class MatrixMutationModel(_msprime.MatrixMutationModel, MutationModel):
"""
Superclass of the specific mutation models with a finite set of states.
This class can be used to create a matrix mutation model with custom
alleles and probabilities. For details of how this works see
:ref:`sec_mutations_matrix_mutation_models_details`.
:param list[str] alleles: A list of the possible alleles generated by the model. Each
entry is a string.
:param list[float] root_distribution: An array of float values the same length
as ``alleles`` which should sum to 1. These values are used to determine the
ancestral state of each mutated site. For example if ``alleles`` is
``['A','B']`` and ``root_distribution`` is ``[1,0]`` all ancestral states will be
``'A'``.
:param list[float] transition_matrix: A square 2d-matrix of transition
probabilities where both dimensions are equal to the length of the ``alleles``
argument. These are used to determine the derived state at each mutation. Note
that rows should sum to 1.
"""
def __str__(self):
s = f"Mutation model with alleles {self.alleles}\n"
s += " root distribution: {}\n".format(
" ".join(map(str, self.root_distribution))
)
s += " transition matrix:\n"
for row in self.transition_matrix:
s += " {}\n".format(" ".join(map(str, row)))
return s
[docs]class SLiMMutationModel(_msprime.SLiMMutationModel, MutationModel):
"""
An infinite-alleles model of mutation producing "SLiM-style" mutations.
To agree with mutations produced by SLiM, the ancestral state of each new
site is set to the empty string, and each derived state is produced by
appending the "next allele" to the previous state. The result is that each
allele is a comma-separated string of all mutations that have occurred up
to the root.
Mutations produced by SLiM carry both a ``time`` attribute, in units of "time ago"
as usual for tskit, as well as a metadata attributed called "origin_generation",
in units of time since the start of the simulation. Adding these two together -
time since the start of the simulation plus time until the end - is equal to the
total number of generations of the simulation. The origin_generation is not
currently used by SLiM, but for consistency, the origin_generation attribute for
mutations produced by this model is set equal to ``slim_generation`` minus
``floor(mut.time)``, where ``mut.time`` is the (tskit) time ago of the mutation.
:param int type: The nonnegative integer defining the "type" of SLiM mutation
that will be recorded in metadata.
:param int next_id: The nonnegative integer to start assigning alleles from.
(default: 0)
:param int slim_generation: The "SLiM generation" time used in determining the
"origin_generation" metadata attribute of mutations. This can usually
be left at its default, which is 1.
:param int block_size: The block size for allocating derived states.
You do not need to change this unless you get an "out of memory" error
due to a very large number of stacked mutations.
"""
def __str__(self):
return (
f"Mutation model for SLiM mutations of type m{self.type}\n"
f" next ID: {self.next_id}\n"
)
# NOTE: we use a hacky workaround here for documenting the next_allele instance
# variable because of the nasty sphinx bug:
# https://github.com/sphinx-doc/sphinx/issues/2549
# This seems to be basically impossible to workaround in JupyterBook, so it
# seems best to just avoid using ivars. The correct approach going forward
# would appear to be to use dataclasses and to document each variable
# independently (e.g., Population class).
[docs]class InfiniteAlleles(_msprime.InfiniteAllelesMutationModel, MutationModel):
"""
An *infinite alleles* model of mutation in which each allele is a
unique positive integer. This works by keeping track of a "next
allele": each time the model is asked to produce a new allele (either for an
ancestral or derived state), the "next allele" is provided, and then
incremented.
**Variables:**
**next_allele**: The *next* allele to be assigned. This increments by
one each time an allele is assigned to a new mutation, and resets to
``start_allele`` each time a new ancestral state is assigned.
:param int start_allele: The nonnegative integer to start assigning alleles from.
(default: 0)
"""
def __str__(self):
return (
f"Infinite alleles mutation model, beginning with allele"
f" {self.start_allele}\n next allele: {self.next_allele}\n"
)
[docs]class BinaryMutationModel(MatrixMutationModel):
"""
Basic binary mutation model with two alleles: ``"0"`` and ``"1"``, and ancestral
allele always ``"0"``.
This is a :class:`.MatrixMutationModel` with alleles ``["0", "1"]``,
root distribution ``[1.0, 0.0]``, and transition matrix that is either
``[[0.0, 1.0], [1.0, 0.0]]`` (by default), or
``[[0.5, 0.5], [0.5, 0.5]]`` (if ``state_independent=True``).
:param bool state_independent: Whether mutations will be generated in a way
independent of the previous allelic state, which includes silent mutations.
See :ref:`sec_mutations_existing` for more details.
"""
def __init__(self, state_independent=False):
alleles = ["0", "1"]
root_distribution = [1, 0]
if state_independent:
transition_matrix = [[0.5, 0.5], [0.5, 0.5]]
else:
transition_matrix = [[0.0, 1.0], [1.0, 0.0]]
super().__init__(alleles, root_distribution, transition_matrix)
[docs]class JC69(MatrixMutationModel):
"""
Jukes-Cantor mutation model (Jukes and Cantor 1969). Based on the standard ACGT
nucleotides as allelic states, this model assumes equal probabilities for
ancestral state and equal probabilities for all possible transitions.
This is a :class:`.MatrixMutationModel` with alleles ``["A", "C", "G", "T"]``,
root distribution ``[1/4, 1/4, 1/4, 1/4]``, and default transition matrix
``[[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3],
[1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]]``.
It has no free parameters.
If ``state_independent`` is True, then instead all entries of the
transition matrix will be 1/4, so one-quarter of the mutations will be
silent. See :ref:`sec_mutations_existing` for more details.
Citation: *Jukes TH, Cantor CR (1969). Evolution of Protein Molecules. pp. 21-132.*
:param bool state_independent: Whether mutations will be generated in a way
independent of the previous allelic state.
"""
def __init__(self, state_independent=False):
alleles = _ACGT_ALLELES
root_distribution = [0.25, 0.25, 0.25, 0.25]
transition_matrix = np.zeros((4, 4))
if state_independent:
transition_matrix[:] = 1 / 4
else:
transition_matrix[:] = 1 / 3
np.fill_diagonal(transition_matrix, 0)
super().__init__(alleles, root_distribution, transition_matrix)
[docs]class HKY(MatrixMutationModel):
"""
The Hasegawa, Kishino and Yano mutation model (Hasegawa et al. 1985). Based on the
standard ACGT nucleotides as allelic states, this model allows different rates for
transitions and transversions, and sets an equilibrium frequency for each nucleotide.
In addition a custom ancestral frequency (``root_distribution``) can be specified.
With ``kappa=1.0`` and the default values of the other arguments this model is equal
to :class:`.JC69`. This model is similar to :class:`.F84`
but with a differing parameterisation for ``kappa``.
This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of
transition to transversion mutation rates, and :math:`\\pi`
(``equilibrium_frequencies``), the vector of equilibrium nucleotide
frequencies. If this mutation model is used with a ``mutation_rate`` of
:math:`\\mu`, then the mutation rate from the i-th to the j-th allele is
equal to :math:`Q_{ij}`, where
:math:`\\mathbf{Q} =
\\frac{\\mu}{M} \\times
\\begin{bmatrix}
\\cdot & \\pi_C & \\kappa \\pi_G & \\pi_T \\\\
\\pi_A & \\cdot & \\pi_G & \\kappa \\pi_T \\\\
\\kappa \\pi_A & \\pi_C & \\cdot & \\pi_T \\\\
\\pi_A & \\kappa \\pi_C & \\pi_G & \\cdot
\\end{bmatrix}`
Here :math:`M` is an overall scaling factor on the mutation rate that can
be computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion,
and :math:`q_{ij} = \\kappa \\pi_j` otherwise. Set :math:`q_{ii} = 0`,
and then let :math:`M` be the largest row sum of :math:`q`, i.e.,
:math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
Then, this implementation as a :class:`.MatrixMutationModel` has the transition
matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`.
Note also that :math:`\\kappa` is the ratio of *individual* mutation rates,
not the ratio of *total* transition to transversion rates, which would
be :math:`\\kappa/2`, as is used in some parameterisations.
If ``state_independent`` is true, then the above is modified by setting
:math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent
only if :math:`\\kappa = 1` by adding additional silent mutations.
See :ref:`sec_mutations_existing` for more details.
Citation: *Hasegawa M, Kishino H, Yano T (1985). "Dating of the human-ape
splitting by a molecular clock of mitochondrial DNA". Journal of Molecular
Evolution. 22 (2): 160–74.*
Note that this implementation has the root distribution as a separate
parameter, although it defaults to the equilibrium distribution. If you
set the root distribution to be different than the equilibrium distribution,
then you have a nonequilibrium model, and you should make sure that's what
you want.
:param float kappa: Scaling parameter by which the transition matrix is modified for
transitions (A<>G, C<>T). For example a value of ``0`` means that no transitions
will occur, ``1`` makes the probability of transitions and
transversions equal.
:param list[float] equilibrium_frequencies: An array of float values of length
4 which should sum to 1. These values are used to determine equilibrium
frequencies (long-term averages) of the alleles. Defaults to
[0.25, 0.25, 0.25, 0.25], i.e., all nucleotides equally likely.
:param list[float] root_distribution: An array of float values of length 4
which should sum to 1. These values are used to determine the ancestral state of
each mutational site. Defaults to the value of ``equilibrium_frequencies``.
:param bool state_independent: Whether to include a higher rate of silent mutations
that makes the model closer to state-independent.
"""
def __init__(
self,
kappa,
equilibrium_frequencies=None,
root_distribution=None,
state_independent=False,
):
alleles = _ACGT_ALLELES
if equilibrium_frequencies is None:
equilibrium_frequencies = np.array(
[0.25, 0.25, 0.25, 0.25], dtype="float64"
)
else:
equilibrium_frequencies = np.array(equilibrium_frequencies, dtype="float64")
if root_distribution is None:
root_distribution = equilibrium_frequencies.copy()
transition_matrix = np.full((4, 4), 1.0)
# positions in transition matrix for transitions
transition_pos = ((0, 1, 2, 3), (2, 3, 0, 1))
transition_matrix[transition_pos] = kappa
transition_matrix *= equilibrium_frequencies
if state_independent:
np.fill_diagonal(transition_matrix, kappa * equilibrium_frequencies)
else:
np.fill_diagonal(transition_matrix, 0.0)
row_sums = transition_matrix.sum(axis=1, dtype="float64")
transition_matrix /= max(row_sums)
row_sums = transition_matrix.sum(axis=1, dtype="float64")
np.fill_diagonal(
transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix))
)
super().__init__(alleles, root_distribution, transition_matrix)
[docs]class F84(MatrixMutationModel):
"""
The F84 mutation model (Felsenstein and Churchill, 1996). Based on the
standard ACGT nucleotides as allelic states, this model takes into account
transitions and transversions, and sets an equilibrium frequency for each nucleotide.
In addition a custom ancestral frequency (``root_distribution``) can be specified.
With ``kappa=1.0`` and the default values of the other arguments this model is equal
to :class:`.JC69`. This model is similar to :class:`.HKY`
but with a differing parameterisation for ``kappa``.
This model is parameterised by :math:`\\kappa` (``kappa``), the ratio of
transition to transversion mutation rates, and :math:`\\pi`
(``equilibrium_frequencies``), the vector of equilibrium nucleotide
frequencies. If this mutation model is used with a ``mutation_rate`` of
:math:`\\mu`, then the mutation rate from the i-th to the j-th allele is
equal to :math:`Q_{ij}`, where
:math:`\\mathbf{Q} =
\\frac{\\mu}{M} \\times
\\begin{bmatrix}
\\cdot & \\pi_C
& \\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_G & \\pi_T \\\\
\\pi_A & \\cdot & \\pi_G
& \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_T \\\\
\\left(1 + \\frac{\\kappa-1}{\\pi_A + \\pi_G}\\right) \\pi_A & \\pi_C
& \\cdot & \\pi_T \\\\
\\pi_A & \\left(1 + \\frac{\\kappa-1}{\\pi_C + \\pi_T}\\right) \\pi_C
& \\pi_G & \\cdot
\\end{bmatrix}`
Here :math:`M` is an overall scaling factor on the mutation rate that can be
computed as follows: let :math:`q_{ij} = \\pi_j` if i<->j is a transversion,
:math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_A + \\pi_G)) \\pi_j` if i and j are
A and G in some order,
:math:`q_{ij} = (1 + (\\kappa-1)/(\\pi_C + \\pi_T)) \\pi_j` if i and j are
C and T in some order, and :math:`q_{ii} = 0`. Let :math:`M` be the largest
row sum of :math:`q`, i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
Then, this implementation as a :class:`.MatrixMutationModel` has the
transition matrix :math:`P_{ij} = q_{ij} / M`,
and :math:`P_{ii} = 1 - \\sum_{j\\neq i} P_{ij}`.
If ``state_independent`` is true, then the above is modified by setting
:math:`q_{ii} = \\kappa \\pi_i`. This makes the model completely state-independent
only if :math:`\\kappa = 1` by adding additional silent mutations.
See :ref:`sec_mutations_existing` for more details.
Citation: *Felsenstein J, Churchill GA (January 1996). "A Hidden Markov
Model approach to variation among sites in rate of evolution". Molecular
Biology and Evolution. 13 (1): 93–104.*
Note that this implementation has the root distribution as a separate
parameter, although it defaults to the equilibrium distribution. If you
set the root distribution to be different than the equilibrium distribution,
then you have a nonequilibrium model, and you should make sure that's what
you want.
:param float kappa: Scaling parameter by which the transition matrix is modified for
transitions (A<>G, C<>T). Must be greater than or equal to the larger of
:math:`(\\pi_A+\\pi_G)/(\\pi_C+\\pi_T)`
and :math:`(\\pi_C+\\pi_T)/(\\pi_A+\\pi_G)`.
:param list[float] equilibrium_frequencies: An array of float values of length
4 which should sum to 1. These values are used to determine equilibrium
frequencies (long-term averages) of the alleles. Defaults to
[0.25, 0.25, 0.25, 0.25], i.e all nucleotides equally likely.
:param list[float] root_distribution: An array of float values of length 4
which should sum to 1. These values are used to determine the ancestral state of
each mutational site. Defaults to the value of ``equilibrium_frequencies``.
:param bool state_independent: Whether to include a higher rate of silent mutations
that makes the model closer to state-independent.
"""
def __init__(
self,
kappa,
equilibrium_frequencies=None,
root_distribution=None,
state_independent=False,
):
alleles = _ACGT_ALLELES
if equilibrium_frequencies is None:
equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25]
if root_distribution is None:
root_distribution = equilibrium_frequencies.copy()
transition_matrix = np.full((4, 4), 1.0, dtype="float64")
# positions in transition matrix for transitions
transition_pos_AG = ((0, 2), (2, 0))
transition_pos_CT = ((1, 3), (3, 1))
p_AG = equilibrium_frequencies[0] + equilibrium_frequencies[2]
p_CT = equilibrium_frequencies[1] + equilibrium_frequencies[3]
gamma = np.float64(1 + (kappa - 1) / np.array([p_AG, p_CT]))
transition_matrix[transition_pos_AG] = gamma[0]
transition_matrix[transition_pos_CT] = gamma[1]
transition_matrix *= equilibrium_frequencies
if state_independent:
np.fill_diagonal(
transition_matrix,
equilibrium_frequencies * np.concatenate([gamma, gamma]),
)
else:
np.fill_diagonal(transition_matrix, 0.0)
row_sums = transition_matrix.sum(axis=1)
transition_matrix = transition_matrix / max(row_sums)
row_sums = transition_matrix.sum(axis=1, dtype="float64")
np.fill_diagonal(
transition_matrix, 1.0 - (row_sums - np.diag(transition_matrix))
)
super().__init__(alleles, root_distribution, transition_matrix)
[docs]class GTR(MatrixMutationModel):
"""
The Generalised Time-Reversible nucleotide mutation model, a general
parameterisation of a time-reversible mutation process (Tavaré et al.
1986). It allows specification of per-nucleotide equilibrium frequencies
and equilibrium transition rates.
This model is parameterised by the vector :math:`r` (``relative_rates``),
and :math:`\\pi` (``equilibrium_frequencies``), the vector of equilibrium nucleotide
frequencies. The entries of ``relative_rates`` are, in this order,
:math:`(r_{AC}, r_{AG}, r_{AT}, r_{CG}, r_{CT}, r_{GT})`. If this mutation model
is used with a ``mutation_rate`` of :math:`\\mu`, then the mutation rate from the
i-th to the j-th allele is :math:`Q_{ij}`, where
:math:`\\mathbf{Q} =
\\frac{\\mu}{M} \\times
\\begin{bmatrix}
\\cdot & r_{AC} \\pi_C & r_{AG} \\pi_G & r_{AT} \\pi_T \\\\
r_{AC} \\pi_A & \\cdot & r_{CG} \\pi_G & r_{CT} \\pi_T \\\\
r_{AG} \\pi_A & r_{CG} \\pi_C & \\cdot & r_{GT} \\pi_T \\\\
r_{AT} \\pi_A & r_{CT} \\pi_C & r_{GT} \\pi_G & \\cdot
\\end{bmatrix}`
Here :math:`M` is an overall scaling factor on the mutation rate that can be
computed as follows: let :math:`q_{ij} = r_{ij} \\pi_j`
and :math:`q_{ii} = 0`,
and then let :math:`M` be the largest row sum of :math:`q`,
i.e., :math:`M = \\max_i \\left( \\sum_j q_{ij} \\right)`.
Then, this implementation as a :class:`.MatrixMutationModel` has
the transition matrix :math:`P_{ij} = q_{ij} / M`, and :math:`P_{ii}`
chosen so rows sum to one.
If ``state_independent`` is true, then the above is modified by setting
:math:`q_{ii} = \\pi_i`. This makes the model completely state-independent
only if all :math:`r_{ij} = 1` by adding additional silent mutations.
See :ref:`sec_mutations_existing` for more details.
Citation: *Tavaré S (1986). "Some Probabilistic and Statistical Problems in the
Analysis of DNA Sequences". Lectures on Mathematics in the Life Sciences. 17:
57–86.*
Note that this implementation has the root distribution as a separate
parameter, although it defaults to the equilibrium distribution. If you
set the root distribution to be different than the equilibrium distribution,
then you have a nonequilibrium model, and you should make sure that's what
you want.
:param list[float] relative_rates: Controls the relative rates of
mutation between each pair of nucleotides, in order "A<>C", "A<>G", "A<>T",
"C<>G", "C<>T", and "G<>T".
:param list[float] equilibrium_frequencies: The equilibrium base
frequencies, a list of four probabilities that sum to one. (Default: equal
frequencies.)
:param list[float] root_distribution: An array of float values of length 4
which should sum to 1. These values are used to determine the ancestral state of
each mutational site. Defaults to the value of ``equilibrium_frequencies``.
:param bool state_independent: Whether to include a higher rate of silent mutations
that makes the model closer to state-independent.
"""
def __init__(
self,
relative_rates,
equilibrium_frequencies=None,
root_distribution=None,
state_independent=False,
):
alleles = _ACGT_ALLELES
assert len(relative_rates) == 6
if equilibrium_frequencies is None:
equilibrium_frequencies = [0.25, 0.25, 0.25, 0.25]
if root_distribution is None:
root_distribution = equilibrium_frequencies
transition_matrix = np.zeros((4, 4))
# relative_rates: [A->C, A->G, A->T, C->G, C->T, G->T]
tri_upper = np.triu_indices_from(transition_matrix, k=1)
transition_matrix[tri_upper] = relative_rates
transition_matrix += transition_matrix.T
if state_independent:
np.fill_diagonal(transition_matrix, 1.0)
transition_matrix *= equilibrium_frequencies
row_sums = transition_matrix.sum(axis=1)
transition_matrix = transition_matrix / max(row_sums)
row_sums = transition_matrix.sum(axis=1, dtype="float64")
np.fill_diagonal(transition_matrix, np.diag(transition_matrix) + 1.0 - row_sums)
super().__init__(alleles, root_distribution, transition_matrix)
[docs]class BLOSUM62(MatrixMutationModel):
"""
The BLOSUM62 model of time-reversible amino acid mutation. This model has
no free parameters.
The model is parameterised by a 20-by-20 symmetric matrix of relative rates,
:math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi`
(for the precise order, see this model's `.alleles` attribute).
If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then
the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where
:math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,`
where :math:`M` is an overall scaling factor chosen so that the largest
row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one.
In this model, :math:`M = 1.726203705809619`.
This implementation as a :class:`.MatrixMutationModel` has transition matrix
:math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one,
and root distribution equal to the equilibrium frequencies, :math:`\\pi`,
so the matrix :math:`B` can be recovered as follows:
.. code-block:: python
model = msprime.BLOSUM62()
M = 1.726203705809619
B = M * model.transition_matrix / model.root_distribution
Citation: The values of :math:`B` and :math:`\\pi` used here were copied from
`Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_.
The original citation is: *Henikoff, S., and J. G. Henikoff. 1992. PNAS USA
89:10915-10919,* and the numerical values were provided in: *Yu,Y.-K.,
Wootton, J.C. and Altschul, S.F. (2003) The compositional adjustment of amino
acid substitution matrices. Proc. Natl Acad. Sci., USA, 100, 15688–15693.*
"""
def __init__(self):
alleles = _AMINO_ACIDS
num_alleles = len(alleles)
root_distribution = [
0.074,
0.052,
0.045,
0.054,
0.025,
0.034,
0.054,
0.074,
0.026,
0.068,
0.099,
0.058,
0.025,
0.047,
0.039,
0.057,
0.051,
0.013,
0.032,
0.073,
]
relative_rates = [
0.735790389698,
0.485391055466,
1.297446705134,
0.543161820899,
0.500964408555,
3.180100048216,
1.459995310470,
0.227826574209,
0.397358949897,
0.240836614802,
1.199705704602,
3.020833610064,
1.839216146992,
1.190945703396,
0.329801504630,
1.170949042800,
1.360574190420,
1.240488508640,
3.761625208368,
0.140748891814,
5.528919177928,
1.955883574960,
0.418763308518,
1.355872344485,
0.798473248968,
0.418203192284,
0.609846305383,
0.423579992176,
0.716241444998,
1.456141166336,
2.414501434208,
0.778142664022,
0.354058109831,
2.435341131140,
1.626891056982,
0.539859124954,
0.605899003687,
0.232036445142,
0.283017326278,
0.418555732462,
0.774894022794,
0.236202451204,
0.186848046932,
0.189296292376,
0.252718447885,
0.800016530518,
0.622711669692,
0.211888159615,
0.218131577594,
0.831842640142,
0.580737093181,
0.372625175087,
0.217721159236,
0.348072209797,
3.890963773304,
1.295201266783,
5.411115141489,
1.593137043457,
1.032447924952,
0.285078800906,
3.945277674515,
2.802427151679,
0.752042440303,
1.022507035889,
0.406193586642,
0.445570274261,
1.253758266664,
0.983692987457,
0.648441278787,
0.222621897958,
0.767688823480,
2.494896077113,
0.555415397470,
0.459436173579,
0.984311525359,
3.364797763104,
6.030559379572,
1.073061184332,
0.492964679748,
0.371644693209,
0.354861249223,
0.281730694207,
0.441337471187,
0.144356959750,
0.291409084165,
0.368166464453,
0.714533703928,
1.517359325954,
2.064839703237,
0.266924750511,
1.773855168830,
1.173275900924,
0.448133661718,
0.494887043702,
0.730628272998,
0.356008498769,
0.858570575674,
0.926563934846,
0.504086599527,
0.527007339151,
0.388355409206,
0.374555687471,
1.047383450722,
0.454123625103,
0.233597909629,
4.325092687057,
1.122783104210,
2.904101656456,
1.582754142065,
1.197188415094,
1.934870924596,
1.769893238937,
1.509326253224,
1.117029762910,
0.357544412460,
0.352969184527,
1.752165917819,
0.918723415746,
0.540027644824,
1.169129577716,
1.729178019485,
0.914665954563,
1.898173634533,
0.934187509431,
1.119831358516,
1.277480294596,
1.071097236007,
0.641436011405,
0.585407090225,
1.179091197260,
0.915259857694,
1.303875200799,
1.488548053722,
0.488206118793,
1.005451683149,
5.151556292270,
0.465839367725,
0.426382310122,
0.191482046247,
0.145345046279,
0.527664418872,
0.758653808642,
0.407635648938,
0.508358924638,
0.301248600780,
0.341985787540,
0.691474634600,
0.332243040634,
0.888101098152,
2.074324893497,
0.252214830027,
0.387925622098,
0.513128126891,
0.718206697586,
0.720517441216,
0.538222519037,
0.261422208965,
0.470237733696,
0.958989742850,
0.596719300346,
0.308055737035,
4.218953969389,
0.674617093228,
0.811245856323,
0.717993486900,
0.951682162246,
6.747260430801,
0.369405319355,
0.796751520761,
0.801010243199,
4.054419006558,
2.187774522005,
0.438388343772,
0.312858797993,
0.258129289418,
1.116352478606,
0.530785790125,
0.524253846338,
0.253340790190,
0.201555971750,
8.311839405458,
2.231405688913,
0.498138475304,
2.575850755315,
0.838119610178,
0.496908410676,
0.561925457442,
2.253074051176,
0.266508731426,
1.000000000000,
]
transition_matrix = np.zeros((num_alleles, num_alleles))
tril = np.tril_indices(num_alleles, k=-1)
transition_matrix[tril] = relative_rates
transition_matrix += np.tril(transition_matrix).T
transition_matrix *= root_distribution
row_sums = transition_matrix.sum(axis=1)
transition_matrix = transition_matrix / max(row_sums)
row_sums = transition_matrix.sum(axis=1, dtype="float64")
np.fill_diagonal(transition_matrix, 1.0 - row_sums)
super().__init__(alleles, root_distribution, transition_matrix)
[docs]class PAM(MatrixMutationModel):
"""
The PAM model of time-reversible amino acid mutation. This model has no
free parameters.
The model is parameterised by a 20-by-20 symmetric matrix of relative rates,
:math:`B`, and a vector of amino acid equilibrium frequencies, :math:`\\pi`
(for the precise order, see this model's `.alleles` attribute).
If this mutation model is used with a ``mutation_rate`` of :math:`\\mu`, then
the mutation rate from the i-th to the j-th allele is :math:`Q_{ij}`, where
:math:`Q_{ij} = \\frac{\\mu}{M} B_{ij} \\pi_j,`
where :math:`M` is an overall scaling factor chosen so that the largest
row sum in the matrix :math:`B_{ij} \\pi_j /M` is equal to one.
In this model, :math:`M = 1.78314862248`.
This implementation as a :class:`.MatrixMutationModel` has transition matrix
:math:`P_{ij} = B_{ij} \\pi_{j} / M`, and :math:`P_{ii}` chosen so rows sum to one,
and root distribution equal to the equilibrium frequencies, :math:`\\pi`,
so the matrix :math:`B` can be recovered as follows:
.. code-block:: python
model = msprime.PAM()
M = 1.78314862248
B = M * model.transition_matrix / model.root_distribution
Citation: The values of :math:`B` and :math:`\\pi` used here were copied from
`Seq-Gen <http://tree.bio.ed.ac.uk/software/seqgen/>`_,
and follow the "Dayhoff DCMut" model as described in *Kosiol, C., and Goldman,
N. 2005. Different versions of the Dayhoff rate matrix. Molecular Biology and
Evolution 22:193-199.* The original citation is: *Dayhoff, M.O., Schwartz,
R.M., Orcutt, B.C. (1978). A model of evolutionary change in proteins. Atlas of
Protein Sequence Structures, Vol. 5, Suppl. 3, National Biomedical Research
Foundation, Washington DC, pp. 345-352.*
"""
def __init__(self):
alleles = _AMINO_ACIDS
num_alleles = len(alleles)
root_distribution = [
0.087127,
0.040904,
0.040432,
0.046872,
0.033474,
0.038255,
0.049530,
0.088612,
0.033619,
0.036886,
0.085357,
0.080481,
0.014753,
0.039772,
0.050680,
0.069577,
0.058542,
0.010494,
0.029916,
0.064717,
]
relative_rates = [
0.267828,
0.984474,
0.327059,
1.199805,
0.000000,
8.931515,
0.360016,
0.232374,
0.000000,
0.000000,
0.887753,
2.439939,
1.028509,
1.348551,
0.000000,
1.961167,
0.000000,
1.493409,
11.388659,
0.000000,
7.086022,
2.386111,
0.087791,
1.385352,
1.240981,
0.107278,
0.281581,
0.811907,
0.228116,
2.383148,
5.290024,
0.868241,
0.282729,
6.011613,
0.439469,
0.106802,
0.653416,
0.632629,
0.768024,
0.239248,
0.438074,
0.180393,
0.609526,
0.000000,
0.076981,
0.406431,
0.154924,
0.341113,
0.000000,
0.000000,
0.730772,
0.112880,
0.071514,
0.443504,
2.556685,
0.258635,
4.610124,
3.148371,
0.716913,
0.000000,
1.519078,
0.830078,
0.267683,
0.270475,
0.460857,
0.180629,
0.717840,
0.896321,
0.000000,
0.000000,
0.000000,
1.127499,
0.304803,
0.170372,
0.000000,
3.332732,
5.230115,
2.411739,
0.183641,
0.136906,
0.138503,
0.000000,
0.000000,
0.000000,
0.000000,
0.153478,
0.475927,
1.951951,
1.565160,
0.000000,
0.921860,
2.485920,
1.028313,
0.419244,
0.133940,
0.187550,
1.526188,
0.507003,
0.347153,
0.933709,
0.119152,
0.316258,
0.335419,
0.170205,
0.110506,
4.051870,
1.531590,
4.885892,
0.956097,
1.598356,
0.561828,
0.793999,
2.322243,
0.353643,
0.247955,
0.171432,
0.954557,
0.619951,
0.459901,
2.427202,
3.680365,
0.265745,
2.271697,
0.660930,
0.162366,
0.525651,
0.340156,
0.306662,
0.226333,
1.900739,
0.331090,
1.350599,
1.031534,
0.136655,
0.782857,
5.436674,
0.000000,
2.001375,
0.224968,
0.000000,
0.000000,
0.000000,
0.000000,
0.000000,
0.270564,
0.000000,
0.461776,
0.000000,
0.000000,
0.762354,
0.000000,
0.740819,
0.000000,
0.244139,
0.078012,
0.946940,
0.000000,
0.953164,
0.000000,
0.214717,
0.000000,
1.265400,
0.374834,
0.286572,
0.132142,
0.000000,
6.952629,
0.000000,
0.336289,
0.417839,
0.608070,
2.059564,
0.240368,
0.158067,
0.178316,
0.484678,
0.346983,
0.367250,
0.538165,
0.438715,
8.810038,
1.745156,
0.103850,
2.565955,
0.123606,
0.485026,
0.303836,
1.561997,
0.000000,
0.279379,
]
transition_matrix = np.zeros((num_alleles, num_alleles))
tril = np.tril_indices(num_alleles, k=-1)
transition_matrix[tril] = relative_rates
transition_matrix += np.tril(transition_matrix).T
transition_matrix *= root_distribution
row_sums = transition_matrix.sum(axis=1)
transition_matrix = transition_matrix / max(row_sums)
row_sums = transition_matrix.sum(axis=1, dtype="float64")
np.fill_diagonal(transition_matrix, 1.0 - row_sums)
super().__init__(alleles, root_distribution, transition_matrix)
# Pre 1.0, we had these constants to define the alphabet for the
# simple infinite sites mutations. These should be deprecated and removed
# along with the InfiniteSites class.
BINARY = 0
NUCLEOTIDES = 1
[docs]class InfiniteSites(MatrixMutationModel):
# This mutation model is defined for backwards compatibility, and is a remnant
# of an earlier design.
def __init__(self, alphabet=BINARY):
self.alphabet = alphabet
models = {BINARY: BinaryMutationModel(), NUCLEOTIDES: JC69()}
if alphabet not in models:
raise ValueError("Bad alphabet")
model = models[alphabet]
super().__init__(
model.alleles, model.root_distribution, model.transition_matrix
)
def _simple_mutate(
*,
tables,
sequence_length,
rate,
random_generator,
):
"""
Generate mutations directly using the low-level interfaces. Only
for internal use with the ms and old-style simulate() interfaces.
"""
rate_map = intervals.RateMap.uniform(sequence_length, rate)
_msprime.sim_mutations(
tables,
random_generator,
rate_map.asdict(),
model=BinaryMutationModel(),
discrete_genome=False,
)
[docs]def mutate(
tree_sequence,
rate=None,
random_seed=None,
model=None,
keep=False,
start_time=None,
end_time=None,
):
"""
Simulates mutations on the specified ancestry and returns the resulting
:class:`tskit.TreeSequence`. Mutations are generated at the specified rate in
measured generations. Mutations are generated under the infinite sites
model, and so the rate of new mutations is per unit of sequence length per
generation.
.. note:: This function is deprecated and we recommend new code use
the :func:`sim_mutations` function instead, which is similar
but offers more functionality and defaults to simulating mutations
on a discrete genome. This function will be supported indefinitely,
however.
If a random seed is specified, this is used to seed the random number
generator. If the same seed is specified and all other parameters are equal
then the same mutations will be generated. If no random seed is specified
then one is generated automatically.
If the ``model`` parameter is specified, this determines the model under
which mutations are generated. By default mutations from the
infinite sites model with a binary alphabet are generated.
By default, sites and mutations in the parameter tree sequence are
discarded. If the ``keep`` parameter is true, however, *additional*
mutations are simulated. Under the infinite sites mutation model, all new
mutations generated will occur at distinct positions from each other and
from any existing mutations (by rejection sampling).
The time interval over which mutations can occur may be controlled
using the ``start_time`` and ``end_time`` parameters. The ``start_time``
defines the lower bound (in time-ago) on this interval and ``max_time``
the upper bound. Note that we may have mutations associated with
nodes with time <= ``start_time`` since mutations store the node at the
bottom (i.e., towards the leaves) of the branch that they occur on.
:param tskit.TreeSequence tree_sequence: The tree sequence onto which we
wish to throw mutations.
:param float rate: The rate of mutation per generation. (Default: 0).
:param int random_seed: The random seed. If this is `None`, a
random seed will be automatically generated. Valid random
seeds must be between 1 and :math:`2^{32} - 1`.
:param MutationModel model: The mutation model to use when generating
mutations. If not specified or None, the :class:`.InfiniteSites`
mutation model is used.
:param bool keep: Whether to keep existing mutations (default: False).
:param float start_time: The minimum time at which a mutation can
occur. (Default: no restriction.)
:param float end_time: The maximum time at which a mutation can occur
(Default: no restriction).
:return: The :class:`tskit.TreeSequence` object resulting from overlaying
mutations on the input tree sequence.
:rtype: :class:`tskit.TreeSequence`
"""
# It's probably an error if someone tries to use a new-style model
# with the old mutate() function, and so we raise an error if anyone
# does, to help them move to the new interface.
if model is not None and not isinstance(model, InfiniteSites):
raise ValueError(
"This is a legacy interface which does not support general "
"mutation models. Please use the sim_ancestry function instead."
)
# The legacy function simulates 0/1 alleles by default.
if model is None:
model = BinaryMutationModel()
return sim_mutations(
tree_sequence,
rate=rate,
random_seed=random_seed,
model=model,
keep=keep,
start_time=start_time,
end_time=end_time,
discrete_genome=False,
)
# String names here should match Seq-Gen where possible
MODEL_MAP = {
"infinite_alleles": InfiniteAlleles,
"binary": BinaryMutationModel,
"jc69": JC69,
"blosum62": BLOSUM62,
"pam": PAM,
# "slim": SLiMMutationModel(), Needs type argument so can't be string init'd
# "hky": HKY(), Needs kappa argument
# "f84": F84(), Needs kappa argument
# "gtr": GTR(), Needs relative_rates argument
}
def mutation_model_factory(model):
"""
Returns a mutation model corresponding to the specified model.
- If model is None, the default mutation model is returned.
- If model is a string, return the corresponding model instance.
- If model is an instance of MutationModel, return it.
- Otherwise raise a type error.
"""
if model is None:
model_instance = JC69()
elif isinstance(model, str):
lower_model = model.lower()
if lower_model not in MODEL_MAP:
raise ValueError(
"Model '{}' unknown. Choose from {}".format(
model, sorted(MODEL_MAP.keys())
)
)
model_instance = MODEL_MAP[lower_model]()
elif isinstance(model, MutationModel):
model_instance = model
else:
raise TypeError(
"Mutation model must be a string or an instance of MutationModel"
)
return model_instance
[docs]def sim_mutations(
tree_sequence,
rate=None,
*,
random_seed=None,
model=None,
start_time=None,
end_time=None,
discrete_genome=None,
keep=None,
):
"""
Simulates mutations on the specified ancestry and returns the resulting
:class:`tskit.TreeSequence`. Mutations are generated at the specified rate
per unit of sequence length, per unit of time. By default, mutations are
generated at discrete sites along the genome and multiple mutations
can occur at any given site. A continuous sequence, infinite-sites model
can also be specified by setting the ``discrete_genome`` parameter to
False.
If the ``model`` parameter is specified, this determines the model
of sequence evolution under which mutations are generated.
The default mutation model is the :class:`msprime.JC69`,
a symmetrical mutation model among the ACGT alleles.
See the :ref:`sec_mutations_models` section for details of available models.
If a ``random_seed`` is specified, this is used to seed the random number
generator. If the same seed is specified and all other parameters are equal
then the same mutations will be generated. If no random seed is specified
then one is generated automatically.
The time interval over which mutations can occur may be controlled
using the ``start_time`` and ``end_time`` parameters. The ``start_time``
defines the lower bound (in time-ago) on this interval and ``max_time``
the upper bound. Note that we may have mutations associated with
nodes with time <= ``start_time`` since mutations store the node at the
bottom (i.e., towards the leaves) of the branch that they occur on.
If the tree sequence already has mutations, these are by default retained,
but can be discarded by passing ``keep=False``. However, adding new
mutations to a tree sequence with existing mutations must be done with
caution, since it can lead to incorrect or nonsensical results if mutation
probabilities differ by ancestral state. (As an extreme example, suppose
that X->Y and X->Z are allowable transitions, but Y->Z is not. If a branch
already has an X->Y mutation on it, then calling `sim_mutations(...,
keep=True)` might insert an X->Z mutation above the existing mutation, thus
implying the impossible chain X->Y->Z.) However, the effect on nucleotide
models of mutation are generally very small.
.. note:: Many mutation models will insert silent transitions (e.g.,
placing a mutation to A above an existing mutation to A). Such mutations
are harmless and are required for us to guarantee the statistical
properties of the process of sequentially adding mutations to a tree
sequence.
:param tskit.TreeSequence tree_sequence: The tree sequence we
wish to throw mutations onto.
:param float rate: The rate of mutation per unit of sequence length
per unit time, as either a single number (for a uniform rate) or as a
:class:`.RateMap`. (Default: 0).
:param int random_seed: The random seed. If this is `None`, a
random seed will be automatically generated. Valid random
seeds must be between 1 and :math:`2^{32} - 1`.
See the :ref:`sec_randomness_seeds` section for usage examples.
:param MutationModel model: The mutation model to use when generating
mutations. This can either be a string (e.g., ``"jc69"``) or
an instance of a simulation model class
e.g, ``msprime.F84(kappa=0.5)``.
If not specified or None, the :class:`.JC69`
mutation model is used. Please see the
:ref:`sec_mutations_models` section for more details
on specifying mutation models.
:param float start_time: The minimum time ago at which a mutation can
occur. (Default: no restriction.)
:param float end_time: The maximum time ago at which a mutation can occur
(Default: no restriction).
:param bool discrete_genome: Whether to generate mutations at only integer positions
along the genome (Default=True).
:param bool keep: Whether to keep existing mutations. (default: True)
:return: The :class:`tskit.TreeSequence` object resulting from overlaying
mutations on the input tree sequence.
:rtype: :class:`tskit.TreeSequence`
"""
try:
tables = tree_sequence.tables
except AttributeError:
raise ValueError("First argument must be a TreeSequence instance.")
seed = random_seed
if random_seed is None:
seed = core.get_random_seed()
else:
seed = int(seed)
parameters = dict(
command="sim_mutations",
tree_sequence=tree_sequence,
rate=rate,
model=model,
start_time=start_time,
end_time=end_time,
discrete_genome=discrete_genome,
keep=keep,
random_seed=seed,
)
encoded_provenance = provenance.json_encode_provenance(
provenance.get_provenance_dict(parameters)
)
if rate is None:
rate = 0
try:
rate = float(rate)
rate_map = intervals.RateMap.uniform(tree_sequence.sequence_length, rate)
except TypeError:
rate_map = rate
if not isinstance(rate_map, intervals.RateMap):
raise TypeError("rate must be a float or a RateMap")
if tables.time_units == tskit.TIME_UNITS_UNCALIBRATED:
raise ValueError(
"Simulating mutations doesn't make sense when time is uncalibrated"
)
start_time = -sys.float_info.max if start_time is None else float(start_time)
end_time = sys.float_info.max if end_time is None else float(end_time)
if start_time > end_time:
raise ValueError("start_time must be <= end_time")
discrete_genome = core._parse_flag(discrete_genome, default=True)
keep = core._parse_flag(keep, default=True)
model = mutation_model_factory(model)
rng = _msprime.RandomGenerator(seed)
lwt = _msprime.LightweightTableCollection()
lwt.fromdict(tables.asdict())
_msprime.sim_mutations(
tables=lwt,
random_generator=rng,
rate_map=rate_map.asdict(),
model=model,
discrete_genome=discrete_genome,
keep=keep,
start_time=start_time,
end_time=end_time,
)
tables = tskit.TableCollection.fromdict(lwt.asdict())
tables.provenances.add_row(encoded_provenance)
return tables.tree_sequence()
```