# Source code for msprime.likelihood

#
# Copyright (C) 2019 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime.  If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for computing likelihoods.
"""
import math

import _msprime

[docs]def unnormalised_log_mutation_likelihood(ts, mu):
"""
Returns the unnormalised log probability of the stored pattern of mutations
on the stored tree sequence, assuming infinite sites mutation. In particular,
each stored site must only contain a single mutation. The omitted normalising
constant depends on the pattern of mutations, but not on the tree sequence or
the mutation rate.

The function first computes the probability of the overall number of mutations
:math:M from the Poisson probability mass function

.. math::
e^{-T \\mu / 2} \\frac{(T \\mu / 2)^M}{M!},

where :math:T is the total area of ancestral material in the tree sequence,
stored in units of generations. Each mutation then contributes an individual factor
of :math:l / T, where :math:l is the total branch length on which the mutation
could have arisen while appearing on all of the required lineages, again stored
in generations.

.. warning::
If the tree sequence contains unary nodes, then :math:l could span more than
one edge. In particular, we do not constrain mutations to take place on the edge
directly above the node on which they have been recorded, but rather on any edge
which would yield the same configuration of SNPs at the leaves of the tree
sequence.

:param tskit.TreeSequence ts: The tree sequence object with mutations.
:param float mu: The per-site, per-generation mutation probablity.
Must be non-negative.
:return: The unnormalised log probability of the observed SNPs given the tree
sequence. If the mutation rate is set to zero and the tree sequence contains
at least one mutation, then returns -float("inf").
"""
tables = ts.tables
time = tables.nodes.time
total_material = 0
for e in tables.edges:
total_material += (e.right - e.left) * (time[e.parent] - time[e.child])
number_of_mutations = len(tables.mutations)
if mu == 0:
if number_of_mutations == 0:
ret = 0
else:
ret = -float("inf")
else:
ret = number_of_mutations * math.log(total_material * mu) - total_material * mu
for tree in ts.trees():
for site in tree.sites():
mutation = site.mutations[0]
child = mutation.node
parent = tree.parent(child)
potential_branch_length = tree.branch_length(child)
while (
tree.parent(parent) is not None and len(tree.children(parent)) == 1
):
child = parent
parent = tree.parent(child)
potential_branch_length += tree.branch_length(child)
child = mutation.node
while len(tree.children(child)) == 1:
child = tree.children(child)[0]
potential_branch_length += tree.branch_length(child)
ret += math.log(potential_branch_length / total_material)
return ret

[docs]def log_arg_likelihood(ts, recombination_rate, Ne=1):
"""
Returns the log probability of the stored tree sequence under the Hudson ARG.
An exact expression for this probability is given in equation (1) of
Kuhner et al. (2000) <https://www.genetics.org/content/156/3/1393>_.

We assume branch lengths stored in generations, resulting in a coalescence
rate of :math:1 / (2 N_e) per pair of lineages.

.. warning::
The stored tree sequence must store the full realisation of the ARG,
including all recombination events and all common ancestor events,
regardless of whether the recombinations cause a change in the ancestral
tree or whether the common ancestor events cause coalescence of ancestral
material. See :ref:sec_tutorial_record_full_arg for details of this
data structure, and how to generate them using msprime.

:param tskit.TreeSequence ts: The tree sequence object.
:param float recombination_rate: The per-link, per-generation recombination
probability. Must be non-negative.
:param float Ne: The diploid effective population size.
:return: The log probability of the tree sequence under the Hudson ancestral
recombination graph model. If the recombination rate is zero and the tree
sequence contains at least one recombination event, then returns
-float("inf").
"""
# Get the tables into the format we need to interchange with the low-level code.
lw_tables = _msprime.LightweightTableCollection()
lw_tables.fromdict(ts.tables.asdict())
return _msprime.log_likelihood_arg(
lw_tables, Ne=Ne, recombination_rate=recombination_rate
)