Source code for msprime.likelihood

#
# Copyright (C) 2019 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime.  If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for computing likelihoods.
"""
import math

import numpy as np

from msprime import _msprime


[docs]def log_mutation_likelihood(ts, mutation_rate): """ Returns the unnormalised log probability of the stored pattern of mutations on the stored tree sequence, assuming infinite sites mutation. In particular, each stored site must only contain a single mutation, and all mutant alleles are treated interchangeably as having arisen from a single mutation rate. This is automatically true for some msprime mutation models, such as JC69. The omitted normalising constant depends on the pattern of mutations, but not on the tree sequence or the mutation rate. .. warning:: The infinite sites assumption means that this method is only valid for continuous genomes. It can also be run on discrete genomes, for which it may provide a reasonable approximation when the number of sites is large and the mutation rate is low. See :ref:`sec_ancestry_discrete_genome` for how continuous genomes can be specified when simulating tree sequences using msprime. The function first computes the probability of the overall number of mutations :math:`M` from the Poisson probability mass function .. math:: e^{-T \\mu / 2} \\frac{(T \\mu / 2)^M}{M!}, where :math:`T` is the total area of ancestral material in the tree sequence stored in units of generations, and :math:`\\mu` is the per-site, per-generation mutation probability. Each mutation then contributes an individual factor of :math:`l / T`, where :math:`l` is the total branch length on which the mutation could have arisen while appearing on all of the required lineages, again stored in generations. .. warning:: If a tree at the site of a mutation contains unary nodes, then :math:`l` could span more than one edge. In particular, we do not constrain mutations to take place on the edge directly above the node on which they have been recorded, but rather on any edge which would yield the same configuration of SNPs at the leaves of the tree sequence. :param tskit.TreeSequence ts: The tree sequence object with mutations. :param float mutation_rate: The per-site, per-generation mutation probability. Must be non-negative. :return: The unnormalised log probability of the observed SNPs given the tree sequence. If the mutation rate is set to zero and the tree sequence contains at least one mutation, then returns `-float("inf")`. """ tables = ts.tables time = tables.nodes.time total_material = 0 for e in tables.edges: total_material += (e.right - e.left) * (time[e.parent] - time[e.child]) number_of_mutations = len(tables.mutations) if mutation_rate == 0: if number_of_mutations == 0: ret = 0 else: ret = -float("inf") else: ret = ( number_of_mutations * math.log(total_material * mutation_rate) - total_material * mutation_rate ) for tree in ts.trees(): for site in tree.sites(): mutation = site.mutations[0] child = mutation.node parent = tree.parent(child) potential_branch_length = tree.branch_length(child) while ( tree.parent(parent) is not None and len(tree.children(parent)) == 1 ): child = parent parent = tree.parent(child) potential_branch_length += tree.branch_length(child) child = mutation.node while len(tree.children(child)) == 1: child = tree.children(child)[0] potential_branch_length += tree.branch_length(child) ret += math.log(potential_branch_length / total_material) return ret
[docs]def log_arg_likelihood(ts, recombination_rate, Ne=1): """ Returns the log probability of the stored tree sequence under the Hudson ARG. An exact expression for this probability is given in equation (1) of `Kuhner et al. (2000) <https://www.genetics.org/content/156/3/1393>`_. We assume branch lengths stored in generations, resulting in a coalescence rate of :math:`1 / (2 N_e)` per pair of lineages. .. warning:: The stored tree sequence must store the full realisation of the ARG, including all recombination events and all common ancestor events, regardless of whether the recombinations cause a change in the ancestral tree or whether the common ancestor events cause coalescence of ancestral material. See :ref:`sec_ancestry_full_arg` for details of this data structure, and how to generate them using ``msprime``. .. warning:: This method only supports continuous genomes. See :ref:`sec_ancestry_discrete_genome` for how these can be specified when simulating tree sequences using ``msprime``. :param tskit.TreeSequence ts: The tree sequence object. :param float recombination_rate: The per-link, per-generation recombination probability. Must be non-negative. :param float Ne: The diploid effective population size. :return: The log probability of the tree sequence under the Hudson ancestral recombination graph model. If the recombination rate is zero and the tree sequence contains at least one recombination event, then returns `-DBL_MAX`. """ for tree in ts.trees(): if np.any(tree.num_children_array[:-1] > 2): raise ValueError( "ARG likelihood encountered a polytomy." " Tree sequences must contain binary mergers only for" " valid likelihood evaluation." ) if tree.num_children_array[-1] > 1: if ts.num_edges > 1: # num_edges check is here because to avoid breaking the expected # result of the TestOddToplogies tests. raise ValueError( "ARG likelihood encountered a tree with multiple roots." " All local trees must have a single mrca for" " valid likelihood evaluation." ) if ts.num_trees > 1 and not np.any(ts.nodes_flags & _msprime.NODE_IS_RE_EVENT): raise ValueError( "ARG likelihood only valid for tree sequences where recombinations" " have been recorded via nodes flagged with `NODE_IS_RE_EVENT`." ) # Get the tables into the format we need to interchange with the low-level code. lw_tables = _msprime.LightweightTableCollection() lw_tables.fromdict(ts.tables.asdict()) return _msprime.log_likelihood_arg( lw_tables, Ne=Ne, recombination_rate=recombination_rate )