# Source code for msprime.ancestry

#
# Copyright (C) 2015-2020 University of Oxford
#
# This file is part of msprime.
#
# msprime is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msprime is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msprime.  If not, see <http://www.gnu.org/licenses/>.
#
"""
Module responsible for defining and running ancestry simulations.
"""
import bisect
import collections
import copy
import gzip
import inspect
import logging
import warnings

import attr
import numpy as np
import tskit

import _msprime
from . import core
from . import demography as demog
from . import mutations
from . import provenance

logger = logging.getLogger(__name__)

def model_factory(model):
"""
Returns a simulation model corresponding to the specified model.
- If model is None, the default simulation model is returned.
- If model is a string, return the corresponding model instance.
- If model is an instance of SimulationModel, return a copy of it.
- Otherwise raise a type error.
"""
model_map = {
"hudson": StandardCoalescent(),
"smc": SmcApproxCoalescent(),
"smc_prime": SmcPrimeApproxCoalescent(),
"dtwf": DiscreteTimeWrightFisher(),
"wf_ped": WrightFisherPedigree(),
}
if model is None:
model_instance = StandardCoalescent()
elif isinstance(model, str):
lower_model = model.lower()
if lower_model not in model_map:
raise ValueError(
"Model '{}' unknown. Choose from {}".format(
model, list(model_map.keys())
)
)
model_instance = model_map[lower_model]
elif not isinstance(model, SimulationModel):
raise TypeError(
"Simulation model must be a string or an instance of SimulationModel"
)
else:
model_instance = model
return model_instance

def parse_model_change_events(events):
"""
Parses the specified list of events provided in model_arg[1:] into
SimulationModelChange events. There are two different forms supported,
and model descriptions are anything supported by model_factory.
"""
err = (
"Simulation model change events must be either a two-tuple "
"(time, model), describing the time of the model change and "
"the new model or be an instance of SimulationModelChange."
)
model_change_events = []
for event in events:
if isinstance(event, (tuple, list)):
if len(event) != 2:
raise ValueError(err)
t = event[0]
if t is not None:
try:
t = float(t)
except (TypeError, ValueError):
raise ValueError(
"Model change times must be either a floating point "
"value or None"
)
event = SimulationModelChange(t, model_factory(event[1]))
elif isinstance(event, SimulationModelChange):
# We don't want to modify our inputs, so take a deep copy.
event = copy.copy(event)
event.model = model_factory(event.model)
else:
raise TypeError(err)
model_change_events.append(event)
return model_change_events

def parse_model_arg(model_arg):
"""
Parses the specified model argument from the simulate function,
returning the initial model and any model change events.
"""
err = (
"The model argument must be either (a) a value that can be "
"interpreted as a simulation model or (b) a list in which "
"the first element is a model description and the remaining "
"elements are model change events. These can either be described "
"by a (time, model) tuple or SimulationModelChange instances."
)
if isinstance(model_arg, (list, tuple)):
if len(model_arg) < 1:
raise ValueError(err)
model = model_factory(model_arg[0])
model_change_events = parse_model_change_events(model_arg[1:])
else:
model = model_factory(model_arg)
model_change_events = []
return model, model_change_events

def filter_events(demographic_events):
"""
Returns a tuple (demographic_events, model_change_events) which separates
out the SimulationModelChange events from the list. This is to support the
pre-1.0 syntax for model changes, where they were included in the
demographic_events parameter.
"""
filtered_events = []
model_change_events = []
for event in demographic_events:
if isinstance(event, SimulationModelChange):
model_change_events.append(event)
else:
filtered_events.append(event)
# Make sure any model references are resolved.
model_change_events = parse_model_change_events(model_change_events)
return filtered_events, model_change_events

def _check_population_configurations(population_configurations):
err = (
"Population configurations must be a list of PopulationConfiguration instances"
)
for config in population_configurations:
if not isinstance(config, demog.PopulationConfiguration):
raise TypeError(err)
if config.initial_size is not None and config.initial_size <= 0:
raise ValueError("Population size must be > 0")
if config.sample_size is not None and config.sample_size < 0:
raise ValueError("Sample size must be >= 0")

def _replicate_generator(
sim, mutation_generator, num_replicates, provenance_dict, end_time
):
"""
Generator function for the many-replicates case of the simulate
function.
"""
encoded_provenance = None
# The JSON is modified for each replicate to insert the replicate number.
# To avoid repeatedly encoding the same JSON (which can take milliseconds)
# we insert a replaceable string.
placeholder = "@@_MSPRIME_REPLICATE_INDEX_@@"
if provenance_dict is not None:
provenance_dict["parameters"]["replicate_index"] = placeholder
encoded_provenance = provenance.json_encode_provenance(
provenance_dict, num_replicates
)

for j in range(num_replicates):
sim.run(end_time)
replicate_provenance = None
if encoded_provenance is not None:
replicate_provenance = encoded_provenance.replace(
f'"{placeholder}"', str(j)
)
tree_sequence = sim.get_tree_sequence(mutation_generator, replicate_provenance)
yield tree_sequence
sim.reset()

def samples_factory(sample_size, samples, pedigree, population_configurations):
"""
Returns a list of Sample objects, given the specified inputs.
"""
the_samples = []
if sample_size is not None:
if samples is not None:
raise ValueError("Cannot specify sample size and samples simultaneously.")
if population_configurations is not None:
raise ValueError(
"Cannot specify sample size and population_configurations "
"simultaneously."
)
s = demog.Sample(population=0, time=0.0)
# NOTE: we need to review this before finalising the interface
# and see if this gels with the idea of having ploidy and
# individuals.
# In pedigrees samples are diploid individuals
if pedigree is not None:
sample_size *= 2  # TODO: Update for different ploidy
the_samples = [s for _ in range(sample_size)]
# If we have population configurations we may have embedded sample_size
# values telling us how many samples to take from each population.
if population_configurations is not None:
_check_population_configurations(population_configurations)
if samples is None:
the_samples = []
for j, conf in enumerate(population_configurations):
if conf.sample_size is not None:
the_samples += [demog.Sample(j, 0) for _ in range(conf.sample_size)]
else:
for conf in population_configurations:
if conf.sample_size is not None:
raise ValueError(
"Cannot specify population configuration sample size"
" and samples simultaneously"
)
the_samples = samples
elif samples is not None:
the_samples = samples
return the_samples

def demography_factory(
Ne, demography, population_configurations, migration_matrix, demographic_events
):
if demography is not None:
if population_configurations is not None:
raise ValueError(
"The demography and population_configurations options "
"cannot be used together"
)
if migration_matrix is not None:
raise ValueError(
"The demography and migration_matrix options cannot be used together"
)
if demographic_events is not None:
raise ValueError(
"The demography and demographic_events options cannot be used together"
)
# Take a copy so that we don't modify the input parameters when
# resolving defaults
demography = copy.deepcopy(demography)
else:
demography = demog.Demography.from_old_style(
population_configurations, migration_matrix, demographic_events
)

# For any populations in which the initial size is None set it to Ne
for pop in demography.populations:
if pop.initial_size is None:
pop.initial_size = Ne

demography.validate()
return demography

def simulator_factory(
sample_size=None,
Ne=1,
random_generator=None,
length=None,
recombination_rate=None,
recombination_map=None,
population_configurations=None,
pedigree=None,
migration_matrix=None,
samples=None,
demographic_events=None,
model=None,
record_migrations=False,
from_ts=None,
start_time=None,
record_full_arg=False,
num_labels=None,
gene_conversion_rate=None,
gene_conversion_track_length=None,
demography=None,
):
"""
Convenience method to create a simulator instance using the same
parameters as the simulate function.
"""
if Ne <= 0:
raise ValueError("Population size must be positive")

samples_specified = (
sample_size is None
and population_configurations is None
and samples is None
and from_ts is None
)
if samples_specified:
# TODO remove the population_configurations message here if we're
# deprecating it?
raise ValueError(
"Either sample_size, samples, population_configurations or from_ts must "
"be specified"
)
samples = samples_factory(sample_size, samples, pedigree, population_configurations)

model, model_change_events = parse_model_arg(model)
if demographic_events is not None:
demographic_events, old_style_model_change_events = filter_events(
demographic_events
)
if len(old_style_model_change_events) > 0:
if len(model_change_events) > 0:
raise ValueError(
"Cannot specify SimulationModelChange events using both new-style "
"and pre 1.0 syntax"
)
model_change_events = old_style_model_change_events

demography = demography_factory(
Ne, demography, population_configurations, migration_matrix, demographic_events
)

# The logic for checking from_ts and recombination map is bound together
# in a complicated way, so we can factor them out into separate functions.
if from_ts is None:
if len(samples) < 2:
raise ValueError("Sample size must be >= 2")
else:
if len(samples) > 0:
raise ValueError("Cannot specify samples with from_ts")
if not isinstance(from_ts, tskit.TreeSequence):
raise TypeError("from_ts must be a TreeSequence instance.")
if demography.num_populations != from_ts.num_populations:
raise ValueError(
"Mismatch in the number of populations in from_ts and simulation "
"parameters. The number of populations in the simulation must be "
"equal to the number of populations in from_ts"
)

if recombination_map is None:
# Default to 1 if no from_ts; otherwise default to the sequence length
# of from_ts
if from_ts is None:
the_length = 1 if length is None else length
else:
the_length = from_ts.sequence_length if length is None else length
the_rate = 0 if recombination_rate is None else recombination_rate
if the_length <= 0:
raise ValueError("Cannot provide non-positive sequence length")
if the_rate < 0:
raise ValueError("Cannot provide negative recombination rate")
recombination_map = RecombinationMap.uniform_map(
the_length, the_rate, discrete=False
)
else:
if not isinstance(recombination_map, RecombinationMap):
raise TypeError("RecombinationMap instance required")
if length is not None or recombination_rate is not None:
raise ValueError(
"Cannot specify length/recombination_rate along with "
"a recombination map"
)
if from_ts is not None:
if recombination_map.get_length() != from_ts.sequence_length:
raise ValueError(
"Recombination map and from_ts must have identical " "sequence_length"
)

if start_time is not None and start_time < 0:
raise ValueError("start_time cannot be negative")

if num_labels is not None and num_labels < 1:
raise ValueError("Must have at least one structured coalescent label")

# FIXME check the valid inputs for GC. Should we allow it when we
# have a non-trivial genetic map?
if gene_conversion_rate is None:
gene_conversion_rate = 0
else:
if not recombination_map.discrete:
raise ValueError(
"Cannot specify gene_conversion_rate along with "
"a nondiscrete recombination map"
)
if gene_conversion_track_length is None:
gene_conversion_track_length = 1

if random_generator is None:
# For the simulate code-path the rng will already be set, but
# for convenience we allow it to be null to help with writing
# tests.
random_generator = _msprime.RandomGenerator(core.get_random_seed())

sim = Simulator(
samples=samples,
recombination_map=recombination_map,
Ne=Ne,
random_generator=random_generator,
pedigree=pedigree,
model=model,
from_ts=from_ts,
store_migrations=record_migrations,
store_full_arg=record_full_arg,
start_time=start_time,
num_labels=num_labels,
gene_conversion_rate=gene_conversion_rate,
gene_conversion_track_length=gene_conversion_track_length,
demography=demography,
model_change_events=model_change_events,
)
return sim

[docs]def simulate(
sample_size=None,
Ne=1,
length=None,
recombination_rate=None,
recombination_map=None,
mutation_rate=None,
population_configurations=None,
pedigree=None,
migration_matrix=None,
demographic_events=None,
samples=None,
model=None,
record_migrations=False,
random_seed=None,
mutation_generator=None,
num_replicates=None,
replicate_index=None,
from_ts=None,
start_time=None,
end_time=None,
record_full_arg=False,
num_labels=None,
record_provenance=True,
# FIXME add documentation for these.
gene_conversion_rate=None,
gene_conversion_track_length=None,
demography=None,
):
"""
Simulates the coalescent with recombination under the specified model
parameters and returns the resulting :class:tskit.TreeSequence. Note that
Ne is the effective diploid population size (so the effective number
of genomes in the population is 2*Ne), but sample_size is the
number of (monoploid) genomes sampled.

:param int sample_size: The number of sampled monoploid genomes.  If not
specified or None, this defaults to the sum of the subpopulation sample
sizes. Either sample_size, population_configurations or
samples must be specified.
:param float Ne: The effective (diploid) population size for the reference
population. This defaults to 1 if not specified.
Please see the :ref:sec_api_simulation_models section for more details
on specifying simulations models.
:param float length: The length of the simulated region in bases.
This parameter cannot be used along with recombination_map.
Defaults to 1 if not specified.
:param float recombination_rate: The rate of recombination per base
per generation. This parameter cannot be used along with
recombination_map. Defaults to 0 if not specified.
:param recombination_map: The map
describing the changing rates of recombination along the simulated
chromosome. This parameter cannot be used along with the
recombination_rate or length parameters, as these
values are encoded within the map. Defaults to a uniform rate as
described in the recombination_rate parameter if not specified.
:type recombination_map: :class:.RecombinationMap
:param float mutation_rate: The rate of infinite sites
mutations per unit of sequence length per generation.
If not specified, no mutations are generated. This option only
allows for infinite sites mutations with a binary (i.e., 0/1)
alphabet. For more control over the mutational process, please
use the :func:.mutate function.
:param list population_configurations: The list of
:class:.PopulationConfiguration instances describing the
sampling configuration, relative sizes and growth rates of
the populations to be simulated. If this is not specified,
a single population with a sample of size sample_size
is assumed.
:type population_configurations: list or None.
:param list migration_matrix: The matrix describing the rates
of migration between all pairs of populations. If :math:N
populations are defined in the population_configurations
parameter, then the migration matrix must be an
:math:N \\times N matrix with 0 on the diagonal, consisting of
:math:N lists of length :math:N or an :math:N \\times N numpy
array, with the [j, k]th element giving the fraction of
population j that consists of migrants from population k in each
generation.
:param list demographic_events: The list of demographic events to
simulate. Demographic events describe changes to the populations
in the past. Events should be supplied in non-decreasing
order of time in the past. Events with the same time value will be
applied sequentially in the order that they were supplied before the
simulation algorithm continues with the next time step.
:param list samples: The list specifying the location and time of
all samples. This parameter may be used to specify historical
samples, and cannot be used in conjunction with the sample_size
parameter. Each sample is a (population, time) pair
such that the sample in position j in the list of samples
is drawn in the specified population at the specfied time. Time
is measured in generations ago, as elsewhere.
:param int random_seed: The random seed. If this is None, a
random seed will be automatically generated. Valid random
seeds must be between 1 and :math:2^{32} - 1.
:param int num_replicates: The number of replicates of the specified
parameters to simulate. If this is not specified or None,
no replication is performed and a :class:tskit.TreeSequence object
returned. If :obj:num_replicates is provided, the specified
number of replicates is performed, and an iterator over the
resulting :class:tskit.TreeSequence objects returned.
:param int replicate_index: Return only a specific tree
sequence from the set of replicates. This is used to recreate a specific tree
sequence from e.g. provenance. This argument only makes sense when used with
random seed, and is not compatible with num_replicates. Note also that
msprime will have to create and discard all the tree sequences up to this index.
:param tskit.TreeSequence from_ts: If specified, initialise the simulation
from the root segments of this tree sequence and return the
completed tree sequence. Please see :ref:here
<sec_api_simulate_from> for details on the required properties
of this tree sequence and its interactions with other parameters.
(Default: None).
:param float start_time: If specified, set the initial time that the
simulation starts to this value. If not specified, the start
time is zero if performing a simulation of a set of samples,
or is the time of the oldest node if simulating from an
existing tree sequence (see the from_ts parameter).
:param float end_time: If specified, terminate the simulation at the
specified time. In the returned tree sequence, all rootward paths from
samples with time < end_time will end in a node with one child with
time equal to end_time. Sample nodes with time >= end_time will
also be present in the output tree sequence. If not specified or None,
run the simulation until all samples have an MRCA at all positions in
the genome.
:param bool record_full_arg: If True, record all intermediate nodes
arising from common ancestor and recombination events in the output
tree sequence. This will result in unary nodes (i.e., nodes in marginal
trees that have only one child). Defaults to False.
:param model: The simulation model to use.
This can either be a string (e.g., "smc_prime") or an instance of
a simulation model class (e.g, msprime.DiscreteTimeWrightFisher(100).
Please see the :ref:sec_api_simulation_models section for more details
on specifying simulations models.
:type model: str or simulation model instance
:param bool record_provenance: If True, record all configuration and parameters
required to recreate the tree sequence. These can be accessed
via TreeSequence.provenances()).
:return: The :class:tskit.TreeSequence object representing the results
of the simulation if no replication is performed, or an
iterator over the independent replicates simulated if the
:obj:num_replicates parameter has been used.
:rtype: :class:tskit.TreeSequence or an iterator over
:class:tskit.TreeSequence replicates.
:warning: If using replication, do not store the results of the
iterator in a list! For performance reasons, the same
underlying object may be used for every TreeSequence
returned which will most likely lead to unexpected behaviour.
"""

seed = random_seed
if random_seed is None:
seed = core.get_random_seed()
seed = int(seed)
rng = _msprime.RandomGenerator(seed)

provenance_dict = None
if record_provenance:
argspec = inspect.getargvalues(inspect.currentframe())
# num_replicates is excluded as provenance is per replicate
# replicate index is excluded as it is inserted for each replicate
parameters = {
"command": "simulate",
**{
arg: argspec.locals[arg]
for arg in argspec.args
if arg not in ["num_replicates", "replicate_index"]
},
}
parameters["random_seed"] = seed
provenance_dict = provenance.get_provenance_dict(parameters)

sim = simulator_factory(
sample_size=sample_size,
random_generator=rng,
Ne=Ne,
length=length,
recombination_rate=recombination_rate,
recombination_map=recombination_map,
population_configurations=population_configurations,
pedigree=pedigree,
migration_matrix=migration_matrix,
demographic_events=demographic_events,
samples=samples,
model=model,
record_migrations=record_migrations,
from_ts=from_ts,
start_time=start_time,
record_full_arg=record_full_arg,
num_labels=num_labels,
gene_conversion_rate=gene_conversion_rate,
gene_conversion_track_length=gene_conversion_track_length,
demography=demography,
)

if mutation_generator is not None:
# This error was added in version 0.6.1.
raise ValueError(
"mutation_generator is not longer supported. Please use "
)

if mutation_rate is not None:
# There is ambiguity in how we should throw mutations onto partially
# built tree sequences: on the whole thing, or must the newly added
# topology? Before or after start_time? We avoid this complexity by
# asking the user to use mutate(), which should have the required
# flexibility.
if from_ts is not None:
raise ValueError(
"Cannot specify mutation rate combined with from_ts. Please use "
"msprime.mutate on the final tree sequence instead"
)
# There is ambiguity in how the start_time argument should interact with
# the mutation generator: should we throw mutations down on the whole
# tree or just the (partial) edges after start_time? To avoid complicating
# things here, make the user use mutate() which should have the flexibility
# to do whatever is needed.
if start_time is not None and start_time > 0:
raise ValueError(
"Cannot specify mutation rate combined with a non-zero "
"start_time. Please use msprime.mutate on the returned "
)
# TODO when the discrete parameter is added here, pass it through
# to make it a property of the mutation generator.
mutation_generator = mutations._simple_mutation_generator(
mutation_rate, sim.sequence_length, sim.random_generator
)
if replicate_index is not None and random_seed is None:
raise ValueError(
"Cannot specify replicate_index without random_seed as this "
"has the same effect as not specifying replicate_index i.e. a "
"random tree sequence"
)
if replicate_index is not None and num_replicates is not None:
raise ValueError(
"Cannot specify replicate_index with num_replicates as only "
"the replicate_index specified will be returned."
)
if num_replicates is None and replicate_index is None:
replicate_index = 0
if replicate_index is not None:
iterator = _replicate_generator(
sim, mutation_generator, replicate_index + 1, provenance_dict, end_time
)
# Return the last element of the iterator
deque = collections.deque(iterator, maxlen=1)
return deque.pop()
else:
return _replicate_generator(
sim, mutation_generator, num_replicates, provenance_dict, end_time
)

class Simulator(_msprime.Simulator):
"""
Class to simulate trees under a variety of population models.
"""

def __init__(
self,
samples,
recombination_map,
Ne,
random_generator,
demography,
model_change_events,
pedigree=None,
model=None,
from_ts=None,
store_migrations=False,
store_full_arg=False,
start_time=None,
num_labels=None,
gene_conversion_rate=0,
gene_conversion_track_length=1,
):
# We always need at least n segments, so no point in making
# allocation any smaller than this.
num_samples = len(samples) if samples is not None else from_ts.num_samples
block_size = 64 * 1024
segment_block_size = max(block_size, num_samples)
avl_node_block_size = block_size
node_mapping_block_size = block_size

if num_labels is None:
num_labels = self._choose_num_labels(model, model_change_events)

# Now, convert the high-level values into their low-level
# counterparts.
ll_pedigree = None
if pedigree is not None:
# TODO see notes on the WrightFisherPedigree; the pedigree should
# be a parameter of the *model*.
pedigree = self._check_pedigree(
pedigree, samples, model, demography, model_change_events
)
ll_pedigree = pedigree.get_ll_representation()
ll_simulation_model = model.get_ll_representation()
ll_population_configuration = [pop.asdict() for pop in demography.populations]
ll_demographic_events = [
event.get_ll_representation() for event in demography.events
]
ll_recomb_map = recombination_map.get_ll_recombination_map()
ll_tables = _msprime.LightweightTableCollection(
recombination_map.get_sequence_length()
)
if from_ts is not None:
from_ts_tables = from_ts.tables.asdict()
ll_tables.fromdict(from_ts_tables)
start_time = -1 if start_time is None else start_time
super().__init__(
samples=samples,
recombination_map=ll_recomb_map,
tables=ll_tables,
start_time=start_time,
random_generator=random_generator,
model=ll_simulation_model,
migration_matrix=demography.migration_matrix,
population_configuration=ll_population_configuration,
pedigree=ll_pedigree,
demographic_events=ll_demographic_events,
store_migrations=store_migrations,
store_full_arg=store_full_arg,
num_labels=num_labels,
segment_block_size=segment_block_size,
avl_node_block_size=avl_node_block_size,
node_mapping_block_size=node_mapping_block_size,
gene_conversion_rate=gene_conversion_rate,
gene_conversion_track_length=gene_conversion_track_length,
)
# attributes that are internal to the highlevel Simulator class
self._hl_from_ts = from_ts
# highlevel attributes used externally that have no lowlevel equivalent
self.model_change_events = model_change_events
self.demography = demography
self.recombination_map = recombination_map

@property
def tables(self):
# convert lowlevel tables to highlevel tables
return tskit.TableCollection.fromdict(super().tables.asdict())

@property
def sample_configuration(self):
"""
Returns the number of samples from each popuation.
"""
ret = [0 for _ in range(self.num_populations)]
for ll_sample in self.samples:
sample = demog.Sample(*ll_sample)
ret[sample.population] += 1
return ret

def _check_pedigree(
self, pedigree, samples, model, demography, model_change_events
):
# TODO this functionality is preliminary and undocumented, so this
# code should be expected to change.
if demography.num_populations != 1:
raise ValueError(
"Cannot yet specify population structure "
"and pedigrees simultaneously"
)
if not isinstance(model, WrightFisherPedigree):
raise ValueError("Pedigree can only be specified for wf_ped model")

if len(samples) % 2 != 0:
raise ValueError(
"In (diploid) pedigrees, must specify two lineages per individual."
)

if pedigree.is_sample is None:
pedigree.set_samples(num_samples=len(samples) // 2)

if sum(pedigree.is_sample) * 2 != len(samples):
raise ValueError(
"{} sample lineages to be simulated, but {} in pedigree".format(
len(samples), pedigree.num_samples * 2
)
)

pedigree_max_time = np.max(pedigree.time)
if len(demography.events) > 0:
de_min_time = min([x.time for x in demography.events])
if de_min_time <= pedigree_max_time:
raise NotImplementedError(
"Demographic events must be older than oldest pedigree founder."
)
if len(model_change_events) > 0:
mc_min_time = min([x.time for x in model_change_events])
if mc_min_time < pedigree_max_time:
raise NotImplementedError(
"Model change events earlier than founders of pedigree unsupported."
)

return pedigree

def _choose_num_labels(self, model, model_change_events):
"""
Choose the number of labels appropriately, given the simulation
models that will be simulated.
"""
num_labels = 1
models = [model] + [event.model for event in model_change_events]
for model in models:
if isinstance(model, SweepGenicSelection):
num_labels = 2
return num_labels

def _run_until(self, end_time, event_chunk=None):
# This is a pretty big default event chunk so that we don't spend
# too much time going back and forth into Python. We could imagine
# doing something a bit more sophisticated where we try to tune the
# number of events so that we end up with roughly 10 second slices
# (say).
if event_chunk is None:
event_chunk = 10 ** 4
if event_chunk <= 0:
raise ValueError("Must have at least 1 event per chunk")
logger.info("Running model %s until max time: %f", self.model, end_time)
while super().run(end_time, event_chunk) == _msprime.EXIT_MAX_EVENTS:
logger.debug("time=%g ancestors=%d", self.time, self.num_ancestors)

def run(self, end_time=None, event_chunk=None):
"""
Runs the simulation until complete coalescence has occurred.
"""
for event in self.model_change_events:
# If the event time is a callable, we compute the end_time
# as a function of the current simulation time.
current_time = self.time
model_start_time = event.time
if callable(event.time):
model_start_time = event.time(current_time)
# If model_start_time is None, we run until the current
# model completes. Note that when event.time is a callable
# it can also return None for this behaviour.
if model_start_time is None:
model_start_time = np.inf
if model_start_time < current_time:
raise ValueError(
"Model start times out of order or not computed correctly. "
f"current time = {current_time}; start_time = {model_start_time}"
)
self._run_until(model_start_time, event_chunk)
ll_new_model = event.model.get_ll_representation()
self.model = ll_new_model
end_time = np.inf if end_time is None else end_time
self._run_until(end_time, event_chunk)
self.finalise_tables()
logger.info(
"Completed at time=%g nodes=%d edges=%d",
self.time,
self.num_nodes,
self.num_edges,
)

def get_tree_sequence(self, mutation_generator=None, provenance_record=None):
"""
Returns a TreeSequence representing the state of the simulation.
"""
if mutation_generator is not None:
mutation_generator.generate(super().tables)
tables = self.tables
if provenance_record is not None:
if self._hl_from_ts is None:
assert len(tables.populations) == self.demography.num_populations
tables.populations.clear()
for population in self.demography.populations:
)
return tables.tree_sequence()

[docs]class RecombinationMap:
"""
A RecombinationMap represents the changing rates of recombination
along a chromosome. This is defined via two lists of numbers:
positions and rates, which must be of the same length.
Given an index j in these lists, the rate of recombination
per base per generation is rates[j] over the interval
positions[j] to positions[j + 1]. Consequently, the first
position must be zero, and by convention the last rate value
is also required to be zero (although it is not used).

.. warning::
The num_loci parameter is deprecated. To set a discrete number of
possible recombination sites along the sequence, scale positions
to the desired number of sites and set discrete=True to ensure
recombination occurs only at integer values.

:param list positions: The positions (in bases) denoting the
distinct intervals where recombination rates change. These can
be floating point values.
:param list rates: The list of rates corresponding to the supplied
positions. Recombination rates are specified per base,
per generation.
:param int num_loci: **This parameter is deprecated**.
The maximum number of non-recombining loci
in the underlying simulation. By default this is set to
the largest possible value, allowing the maximum resolution
in the recombination process. However, for a finite sites
model this can be set to smaller values.
:param bool discrete: Whether recombination can occur only at integer
positions. When False, recombination sites can take continuous
values. To simulate a fixed number of loci, set this parameter to
True and scale positions to span the desired number of loci.
"""

def __init__(self, positions, rates, num_loci=None, discrete=False, map_start=0):
if num_loci is not None:
if num_loci == positions[-1]:
warnings.warn("num_loci is no longer supported and should not be used.")
else:
raise ValueError(
"num_loci does not match sequence length. "
"To set a discrete number of recombination sites, "
"scale positions to span the desired number of loci "
"and set discrete=True"
)
self._ll_recombination_map = _msprime.RecombinationMap(
positions, rates, discrete
)
self.map_start = map_start

[docs]    @classmethod
def uniform_map(cls, length, rate, num_loci=None, discrete=False):
"""
Returns a :class:.RecombinationMap instance in which the recombination
rate is constant over a chromosome of the specified length. The optional
discrete controls whether recombination sites can occur only on integer
positions or can take continuous values. The legacy num_loci option is
no longer supported and should not be used.

The following map can be used to simulate a true finite locus model
with a fixed number of loci m::

>>> recomb_map = RecombinationMap.uniform_map(m, rate, discrete=True)

:param float length: The length of the chromosome.
:param float rate: The rate of recombination per unit of sequence length
along this chromosome.
:param int num_loci: This parameter is no longer supported.
:param bool discrete: Whether recombination can occur only at integer
positions. When False, recombination sites can take continuous
values. To simulate a fixed number of loci, set this parameter to
True and set length to the desired number of loci.
"""
return cls([0, length], [rate, 0], num_loci=num_loci, discrete=discrete)

[docs]    @classmethod
"""
Parses the specified file in HapMap format. These files must be
white-space-delimited, and contain a single header line (which is
ignored), and then each subsequent line contains the starting position
and recombination rate for the segment from that position (inclusive)
to the starting position on the next line (exclusive). Starting
positions of each segment are given in units of bases, and
recombination rates in centimorgans/Megabase. The first column in this
file is ignored, as are additional columns after the third (Position is
assumed to be the second column, and Rate is assumed to be the third).
If the first starting position is not equal to zero, then a
zero-recombination region is inserted at the start of the chromosome.

A sample of this format is as follows::

Chromosome	Position(bp)	Rate(cM/Mb)	Map(cM)
chr1	55550	        2.981822	0.000000
chr1	82571	        2.082414	0.080572
chr1	88169	        2.081358	0.092229
chr1	254996	        3.354927	0.439456
chr1	564598	        2.887498	1.478148
...
chr1	182973428	2.512769	122.832331
chr1	183630013	0.000000	124.482178

:param str filename: The name of the file to be parsed. This may be
in plain text or gzipped plain text.
"""
positions = []
rates = []
if filename.endswith(".gz"):
f = gzip.open(filename)
else:
f = open(filename)
try:
# Skip the header line
for j, line in enumerate(f):
pos, rate, = map(float, line.split()[1:3])
if j == 0:
map_start = pos
if pos != 0:
positions.append(0)
rates.append(0)
positions.append(pos)
# Rate is expressed in centimorgans per megabase, which
# we convert to per-base rates
rates.append(rate * 1e-8)
if rate != 0:
raise ValueError(
"The last rate provided in the recombination map must be zero"
)
finally:
f.close()
return cls(positions, rates, map_start=map_start)

@property
def mean_recombination_rate(self):
"""
Return the weighted mean recombination rate
across all windows of the entire recombination map.
"""
chrom_length = self._ll_recombination_map.get_sequence_length()

positions = self._ll_recombination_map.get_positions()
positions_diff = self._ll_recombination_map.get_positions()[1:]
positions_diff = np.append(positions_diff, chrom_length)
window_sizes = positions_diff - positions

weights = window_sizes / chrom_length
if self.map_start != 0:
weights[0] = 0
rates = self._ll_recombination_map.get_rates()

return np.average(rates, weights=weights)

[docs]    def slice(self, start=None, end=None, trim=False):  # noqa: A003
"""
Returns a subset of this recombination map between the specified end
points. If start is None, it defaults to 0. If end is None, it defaults
to the end of the map. If trim is True, remove the flanking
zero recombination rate regions such that the sequence length of the
new recombination map is end - start.
"""
positions = self.get_positions()
rates = self.get_rates()

if start is None:
i = 0
start = 0
if end is None:
end = positions[-1]
j = len(positions)

if (
start < 0
or end < 0
or start > positions[-1]
or end > positions[-1]
or start > end
):
raise IndexError(f"Invalid subset: start={start}, end={end}")

if start != 0:
i = bisect.bisect_left(positions, start)
if start < positions[i]:
i -= 1
if end != positions[-1]:
j = bisect.bisect_right(positions, end, lo=i)

new_positions = list(positions[i:j])
new_rates = list(rates[i:j])
new_positions[0] = start
if end > new_positions[-1]:
new_positions.append(end)
new_rates.append(0)
else:
new_rates[-1] = 0
if trim:
new_positions = [pos - start for pos in new_positions]
else:
if new_positions[0] != 0:
if new_rates[0] == 0:
new_positions[0] = 0
else:
new_positions.insert(0, 0)
new_rates.insert(0, 0.0)
if new_positions[-1] != positions[-1]:
new_positions.append(positions[-1])
new_rates.append(0)
return self.__class__(new_positions, new_rates, discrete=self.discrete)

def __getitem__(self, key):
"""
Use slice syntax for obtaining a recombination map subset. E.g.
>>> recomb_map_4m_to_5m = recomb_map[4e6:5e6]
"""
if not isinstance(key, slice) or key.step is not None:
raise TypeError("Only interval slicing is supported")
start, end = key.start, key.stop
if start is not None and start < 0:
start += self.get_sequence_length()
if end is not None and end < 0:
end += self.get_sequence_length()
return self.slice(start=start, end=end, trim=True)

def get_ll_recombination_map(self):
return self._ll_recombination_map

def physical_to_genetic(self, physical_x):
return self._ll_recombination_map.position_to_mass(physical_x)

def physical_to_discrete_genetic(self, physical_x):
raise ValueError("Discrete genetic space is no longer supported")

def genetic_to_physical(self, genetic_x):
return self._ll_recombination_map.mass_to_position(genetic_x)

def get_total_recombination_rate(self):
return self._ll_recombination_map.get_total_recombination_rate()

def get_per_locus_recombination_rate(self):
raise ValueError("Genetic loci are no longer supported")

def get_size(self):
return self._ll_recombination_map.get_size()

def get_num_loci(self):
raise ValueError("num_loci is no longer supported")

def get_positions(self):
# For compatability with existing code we convert to a list
return list(self._ll_recombination_map.get_positions())

def get_rates(self):
# For compatability with existing code we convert to a list
return list(self._ll_recombination_map.get_rates())

def get_sequence_length(self):
return self._ll_recombination_map.get_sequence_length()

def get_length(self):
# Deprecated: use sequence_length instead
return self.get_sequence_length()

@property
def discrete(self):
return self._ll_recombination_map.get_discrete()

def asdict(self):
return {
"positions": self.get_positions(),
"rates": self.get_rates(),
"discrete": self.discrete,
"map_start": self.map_start,
}

# TODO update the documentation here to state that using this class is
# deprecated, and users should use the model=[...] notation instead.
[docs]@attr.s
class SimulationModelChange:
"""
An event representing a change of underlying :ref:simulation model
<sec_api_simulation_models>.

:param float time: The time at which the simulation model changes
to the new model, in generations. After this time, all internal
tree nodes, edges and migrations are the result of the new model.
If time is set to None (the default), the model change will occur
immediately after the previous model has completed. If time is a
callable, the time at which the simulation model changes is the result
of calling this function with the time that the previous model
started with as a parameter.
:param model: The new simulation model to use.
This can either be a string (e.g., "smc_prime") or an instance of
a simulation model class (e.g, msprime.DiscreteTimeWrightFisher(100).
Please see the :ref:sec_api_simulation_models section for more details
on specifying simulations models. If the argument is a string, the
reference population size is set from the top level Ne parameter
to :func:.simulate. If this is None (the default) the model is
changed to the standard coalescent with a reference_size of
Ne (if model was not specified).
:type model: str or simulation model instance
"""

time = attr.ib(default=None)
model = attr.ib(default=None)

def asdict(self):
return attr.asdict(self)

@attr.s
class SimulationModel:
"""
Abstract superclass of all simulation models.
"""

name = None

def get_ll_representation(self):
return {"name": self.name}

def asdict(self):
return attr.asdict(self)

[docs]class StandardCoalescent(SimulationModel):
"""
The classical coalescent with recombination model (i.e., Hudson's algorithm).
The string "hudson" can be used to refer to this model.

This is the default simulation model.
"""

name = "hudson"

[docs]class SmcApproxCoalescent(SimulationModel):
"""
The original SMC model defined by McVean and Cardin. This
model is implemented using a naive rejection sampling approach
and so it may not be any more efficient to simulate than the
standard Hudson model.

The string "smc" can be used to refer to this model.
"""

name = "smc"

[docs]class SmcPrimeApproxCoalescent(SimulationModel):
"""
The SMC' model defined by Marjoram and Wall as an improvement on the
original SMC. model is implemented using a naive rejection sampling
approach and so it may not be any more efficient to simulate than the
standard Hudson model.

The string "smc_prime" can be used to refer to this model.
"""

name = "smc_prime"

[docs]class DiscreteTimeWrightFisher(SimulationModel):
"""
A discrete backwards-time Wright-Fisher model, with diploid back-and-forth
recombination. The string "dtwf" can be used to refer to this model.

Wright-Fisher simulations are performed very similarly to coalescent
simulations, with all parameters denoting the same quantities in both
models. Because events occur at discrete times however, the order in which
they occur matters. Each generation consists of the following ordered
events:

- Migration events. As in the Hudson coalescent, these move single extant
lineages between populations. Because migration events occur before
lineages choose parents, migrant lineages choose parents from their new
population in the same generation.
- Demographic events. All events with previous_generation < event_time <=
current_generation are carried out here.
- Lineages draw parents. Each (monoploid) extant lineage draws a parent
from their current population.
- Diploid recombination. Each parent is diploid, so all child lineages
recombine back-and-forth into the same two parental genome copies. These
become two independent lineages in the next generation.
- Historical sampling events. All historical samples with
previous_generation < sample_time <= current_generation are inserted.

"""

name = "dtwf"

class WrightFisherPedigree(SimulationModel):
# TODO Complete documentation.
# TODO Since the pedigree is a necessary parameter for this simulation
# model and it cannot be used with any other model we should make it a
# parametric model where the parameter is the pedigree. This would
# streamline a bunch of logic.
"""
Backwards-time simulations through a pre-specified pedigree, with diploid
individuals and back-and-forth recombination. The string "wf_ped" can
be used to refer to this model.
"""
name = "wf_ped"

class ParametricSimulationModel(SimulationModel):
"""
The superclass of simulation models that require extra parameters.
"""

def get_ll_representation(self):
d = super().get_ll_representation()
d.update(self.__dict__)
return d

[docs]@attr.s
class BetaCoalescent(ParametricSimulationModel):
"""
A diploid Xi-coalescent with up to four simultaneous multiple mergers and
crossover recombination.

There are two main differences between the Beta-Xi-coalescent and the
standard coalescent. Firstly, the number of lineages that take part in each
common ancestor event is random, with distribution determined by moments of
the :math:Beta(2 - \\alpha, \\alpha)-distribution. In particular, when there
are :math:n lineages, each set of :math:k \\leq n of them participates in a
common ancestor event at rate

.. math::
\\frac{8}{B(2 - \\alpha, \\alpha)}
\\int_0^1 x^{k - \\alpha - 1} (1 - x)^{n - k + \\alpha - 1} dx,

where :math:B(2 - \\alpha, \\alpha) is the Beta-function.

In a common ancestor event, all participating lineages are randomly split
into four groups, corresponding to the four parental chromosomes in a diploid,
bi-parental reproduction event. All lineages within each group merge simultaneously.

.. warning::
The prefactor of 8 in the common ancestor event rate arises as the product
of two terms. A factor of 4 compensates for the fact that only one quarter
of binary common ancestor events result in a merger due to diploidy.
A further factor of 2 is included for consistency with the implementation
of the Hudson model, in which :math:n lineages undergo binary mergers at
rate :math:n (n - 1).

Secondly, the time scale predicted by the Beta-Xi-coalescent is proportional
to :math:N_e^{\\alpha - 1} generations, where :math:N_e is the effective
population size. Specifically, one unit of coalescent time corresponds to
a number of generations given by

.. math::
\\frac{m^{\\alpha} N_e^{\\alpha - 1}}{\\alpha B(2 - \\alpha, \\alpha)},

where

.. math::
m = 2 + \\frac{2^{\\alpha}}{3^{\\alpha - 1} (\\alpha - 1)}.

Note that the time scale depends both on the effective population size :math:N_e
and :math:\\alpha, and can be dramatically shorter than the timescale of the
standard coalescent. Thus, effective population sizes must often be many orders
of magnitude larger than census population sizes. The per-generation recombination
rate is rescaled similarly to obtain the population-rescaled recombination rate.

See Schweinsberg (2003)
<https://www.sciencedirect.com/science/article/pii/S0304414903000280>_
for the derivation of the common ancestor event rate, as well as the time scaling.
Note however that the model of Schweinsberg (2003) is haploid, so that
all participating lineages merge in a common ancestor event without
splitting into four groups.

:param float alpha: Determines the degree of skewness in the family size
distribution, and must satisfy :math:1 < \\alpha < 2. Smaller values of
:math:\\alpha correspond to greater skewness, and :math:\\alpha = 2
would coincide with the standard coalescent.
:param float truncation_point: Determines the maximum fraction of the
population replaced by offspring in one reproduction event, and must
satisfy :math:0 < \\tau \\leq 1, where :math:\\tau is the truncation point.
The default is :math:\\tau = 1, which corresponds to the standard
Beta-Xi-coalescent. When :math:\\tau < 1, the number of lineages
participating in a common ancestor event is determined by moments
of the :math:Beta(2 - \\alpha, \\alpha) distribution conditioned on not
exceeding :math:\\tau, and the Beta-function in the expression
for the time scale is also replaced by the incomplete Beta function
:math:Beta(\\tau; 2 - \\alpha, \\alpha).
"""

name = "beta"

alpha = attr.ib(default=None)
truncation_point = attr.ib(default=1)

[docs]@attr.s
class DiracCoalescent(ParametricSimulationModel):
"""
A diploid Xi-coalescent with up to four simultaneous multiple mergers and
crossover recombination.

The Dirac-Xi-coalescent is an implementation of the model of
Blath et al. (2013) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3527250/>_
The simulation proceeds similarly to the standard coalescent.
In addition to binary common ancestor events at rate :math:n (n - 1) when
there are :math:n lineages, potential multiple merger events take place
at rate :math:2 c > 0. Each lineage participates in each multiple merger
event independently with probability :math:0 < \\psi \\leq 1. All participating
lineages are randomly split into four groups, corresponding to the four
parental chromosomes present in a diploid, bi-parental reproduction event,
and the lineages within each group merge simultaneously.

.. warning::
The Dirac-Xi-coalescent is obtained as a scaling limit of Moran models,
rather than Wright-Fisher models. As a consequence, one unit of coalescent
time is proportional to :math:N_e^2 generations,
rather than :math:N_e generations as in the standard coalescent.
However, the coalescent recombination rate is obtained from the
per-generation recombination probability by rescaling with
:math:N_e. See :ref:sec_tutorial_multiple_mergers
for an illustration of how this affects simulation output in practice.

:param float c: Determines the rate of potential multiple merger events.
We require :math:c > 0.
:param float psi: Determines the fraction of the population replaced by
offspring in one large reproduction event, i.e. one reproduction event
giving rise to potential multiple mergers when viewed backwards in time.
We require :math:0 < \\psi \\leq 1.
"""

name = "dirac"

psi = attr.ib(default=None)
c = attr.ib(default=None)

@attr.s
class SweepGenicSelection(ParametricSimulationModel):
# TODO document and finalise the API
name = "sweep_genic_selection"

position = attr.ib(default=None)
start_frequency = attr.ib(default=None)
end_frequency = attr.ib(default=None)
alpha = attr.ib(default=None)
dt = attr.ib(default=None)