Source code for tskit.tables

#
# MIT License
#
# Copyright (c) 2018-2024 Tskit Developers
# Copyright (c) 2017 University of Oxford
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Tree sequence IO via the tables API.
"""
import collections.abc
import dataclasses
import datetime
import json
import numbers
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from functools import reduce
from typing import Dict
from typing import Optional
from typing import Union

import numpy as np

import _tskit
import tskit
import tskit.metadata as metadata
import tskit.provenance as provenance
import tskit.util as util
from tskit import UNKNOWN_TIME

dataclass_options = {"frozen": True}


# Needed for cases where `None` can be an appropriate kwarg value;
# we override the metaclass repr so that it looks good in the docs.
class NotSetMeta(type):
    def __repr__(cls):
        return "Not set"


class NOTSET(metaclass=NotSetMeta):
    pass
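
# Illustrative sketch (not part of the module): NOTSET is a sentinel default
# for keyword arguments where ``None`` is itself a meaningful value, and the
# metaclass above makes it render as "Not set" in generated documentation:
#
#   repr(NOTSET)  # -> "Not set"
#
#   def f(default_value=NOTSET):
#       if default_value == NOTSET:  # no default was supplied at all
#           ...
#       else:
#           ...  # a default was given, possibly a legitimate None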


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class IndividualTableRow(util.Dataclass):
    """
    A row in an :class:`IndividualTable`.
    """

    __slots__ = ["flags", "location", "parents", "metadata"]
    flags: int
    """
    See :attr:`Individual.flags`
    """
    location: np.ndarray
    """
    See :attr:`Individual.location`
    """
    parents: np.ndarray
    """
    See :attr:`Individual.parents`
    """
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Individual.metadata`
    """

    # We need a custom eq for the numpy arrays
    def __eq__(self, other):
        return (
            isinstance(other, IndividualTableRow)
            and self.flags == other.flags
            and np.array_equal(self.location, other.location)
            and np.array_equal(self.parents, other.parents)
            and self.metadata == other.metadata
        )


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class NodeTableRow(util.Dataclass):
    """
    A row in a :class:`NodeTable`.
    """

    __slots__ = ["flags", "time", "population", "individual", "metadata"]
    flags: int
    """
    See :attr:`Node.flags`
    """
    time: float
    """
    See :attr:`Node.time`
    """
    population: int
    """
    See :attr:`Node.population`
    """
    individual: int
    """
    See :attr:`Node.individual`
    """
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Node.metadata`
    """


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class EdgeTableRow(util.Dataclass):
    """
    A row in an :class:`EdgeTable`.
    """

    __slots__ = ["left", "right", "parent", "child", "metadata"]
    left: float
    """
    See :attr:`Edge.left`
    """
    right: float
    """
    See :attr:`Edge.right`
    """
    parent: int
    """
    See :attr:`Edge.parent`
    """
    child: int
    """
    See :attr:`Edge.child`
    """
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Edge.metadata`
    """


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class MigrationTableRow(util.Dataclass):
    """
    A row in a :class:`MigrationTable`.
    """

    __slots__ = ["left", "right", "node", "source", "dest", "time", "metadata"]
    left: float
    """
    See :attr:`Migration.left`
    """
    right: float
    """
    See :attr:`Migration.right`
    """
    node: int
    """
    See :attr:`Migration.node`
    """
    source: int
    """
    See :attr:`Migration.source`
    """
    dest: int
    """
    See :attr:`Migration.dest`
    """
    time: float
    """
    See :attr:`Migration.time`
    """
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Migration.metadata`
    """


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class SiteTableRow(util.Dataclass):
    """
    A row in a :class:`SiteTable`.
    """

    __slots__ = ["position", "ancestral_state", "metadata"]
    position: float
    """
    See :attr:`Site.position`
    """
    ancestral_state: str
    """
    See :attr:`Site.ancestral_state`
    """
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Site.metadata`
    """


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class MutationTableRow(util.Dataclass):
    """
    A row in a :class:`MutationTable`.
    """

    __slots__ = ["site", "node", "derived_state", "parent", "metadata", "time"]
    site: int
    """
    See :attr:`Mutation.site`
    """
    node: int
    """
    See :attr:`Mutation.node`
    """
    derived_state: str
    """
    See :attr:`Mutation.derived_state`
    """
    parent: int
    """
    See :attr:`Mutation.parent`
    """
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Mutation.metadata`
    """
    time: float
    """
    See :attr:`Mutation.time`
    """

    # We need a custom eq here as we have unknown times (nans) to check
    def __eq__(self, other):
        return (
            isinstance(other, MutationTableRow)
            and self.site == other.site
            and self.node == other.node
            and self.derived_state == other.derived_state
            and self.parent == other.parent
            and self.metadata == other.metadata
            and (
                self.time == other.time
                or (
                    util.is_unknown_time(self.time)
                    and util.is_unknown_time(other.time)
                )
            )
        )


@metadata.lazy_decode()
@dataclass(**dataclass_options)
class PopulationTableRow(util.Dataclass):
    """
    A row in a :class:`PopulationTable`.
    """

    __slots__ = ["metadata"]
    metadata: Optional[Union[bytes, dict]]
    """
    See :attr:`Population.metadata`
    """


@dataclass(**dataclass_options)
class ProvenanceTableRow(util.Dataclass):
    """
    A row in a :class:`ProvenanceTable`.
    """

    __slots__ = ["timestamp", "record"]
    timestamp: str
    """
    See :attr:`Provenance.timestamp`
    """
    record: str
    """
    See :attr:`Provenance.record`
    """

@dataclass(**dataclass_options)
class TableCollectionIndexes(util.Dataclass):
    """
    A class encapsulating the indexes of a :class:`TableCollection`
    """

    edge_insertion_order: np.ndarray = None
    edge_removal_order: np.ndarray = None

    def asdict(self):
        return {k: v for k, v in dataclasses.asdict(self).items() if v is not None}

    @property
    def nbytes(self) -> int:
        """
        The number of bytes taken by the indexes
        """
        total = 0
        if self.edge_removal_order is not None:
            total += self.edge_removal_order.nbytes
        if self.edge_insertion_order is not None:
            total += self.edge_insertion_order.nbytes
        return total

def keep_with_offset(keep, data, offset):
    """
    Used when filtering _offset columns in tables
    """
    # We need the astype here for 32 bit machines
    lens = np.diff(offset).astype(np.int32)
    return (
        data[np.repeat(keep, lens)],
        np.concatenate(
            [
                np.array([0], dtype=offset.dtype),
                np.cumsum(lens[keep], dtype=offset.dtype),
            ]
        ),
    )


class BaseTable:
    """
    Superclass of high-level tables. Not intended for direct instantiation.
    """

    # The list of columns in the table. Must be set by subclasses.
    column_names = []

    def __init__(self, ll_table, row_class):
        self.ll_table = ll_table
        self.row_class = row_class

    def _check_required_args(self, **kwargs):
        for k, v in kwargs.items():
            if v is None:
                raise TypeError(f"{k} is required")

    @property
    def num_rows(self) -> int:
        return self.ll_table.num_rows

    @property
    def max_rows(self) -> int:
        return self.ll_table.max_rows

    @property
    def max_rows_increment(self) -> int:
        return self.ll_table.max_rows_increment

    @property
    def nbytes(self) -> int:
        """
        Returns the total number of bytes required to store the data
        in this table. Note that this may not be equal to
        the actual memory footprint.
        """
        # It's not ideal that we run asdict() here to do this as we're
        # currently creating copies of the column arrays, so it would
        # be more efficient to have dedicated low-level methods. However,
        # if we do have read-only views on the underlying memory for the
        # column arrays then this will be a perfectly good way of
        # computing the nbytes values and the overhead minimal.
        d = self.asdict()
        nbytes = 0
        # Some tables don't have a metadata_schema
        metadata_schema = d.pop("metadata_schema", None)
        if metadata_schema is not None:
            nbytes += len(metadata_schema.encode())
        nbytes += sum(col.nbytes for col in d.values())
        return nbytes

    def equals(self, other, ignore_metadata=False):
        """
        Returns True if `self` and `other` are equal. By default, two tables
        are considered equal if their columns and metadata schemas are
        byte-for-byte identical.

        :param other: Another table instance
        :param bool ignore_metadata: If True exclude metadata and metadata schemas
            from the comparison.
        :return: True if other is equal to this table; False otherwise.
        :rtype: bool
        """
        # Note: most tables support ignore_metadata, we can override for
        # those that don't
        ret = False
        if type(other) is type(self):
            ret = bool(
                self.ll_table.equals(other.ll_table, ignore_metadata=ignore_metadata)
            )
        return ret

    def assert_equals(self, other, *, ignore_metadata=False):
        """
        Raise an AssertionError for the first found difference between
        this and another table of the same type.

        :param other: Another table instance
        :param bool ignore_metadata: If True exclude metadata and metadata schemas
            from the comparison.
""" if type(other) is not type(self): raise AssertionError(f"Types differ: self={type(self)} other={type(other)}") # Check using the low-level method to avoid slowly going through everything if self.equals(other, ignore_metadata=ignore_metadata): return if not ignore_metadata and self.metadata_schema != other.metadata_schema: raise AssertionError( f"{type(self).__name__} metadata schemas differ: " f"self={self.metadata_schema} " f"other={other.metadata_schema}" ) for n, (row_self, row_other) in enumerate(zip(self, other)): if ignore_metadata: row_self = dataclasses.replace(row_self, metadata=None) row_other = dataclasses.replace(row_other, metadata=None) if row_self != row_other: self_dict = dataclasses.asdict(self[n]) other_dict = dataclasses.asdict(other[n]) diff_string = [] for col in self_dict.keys(): if isinstance(self_dict[col], np.ndarray): equal = np.array_equal(self_dict[col], other_dict[col]) else: equal = self_dict[col] == other_dict[col] if not equal: diff_string.append( f"self.{col}={self_dict[col]} other.{col}={other_dict[col]}" ) diff_string = "\n".join(diff_string) raise AssertionError( f"{type(self).__name__} row {n} differs:\n{diff_string}" ) if self.num_rows != other.num_rows: raise AssertionError( f"{type(self).__name__} number of rows differ: self={self.num_rows} " f"other={other.num_rows}" ) raise AssertionError( "Tables differ in an undetected way - " "this is a bug, please report an issue on gitub" ) # pragma: no cover def __eq__(self, other): return self.equals(other) def __len__(self): return self.num_rows def __getattr__(self, name): if name in self.column_names: return getattr(self.ll_table, name) else: raise AttributeError( f"{self.__class__.__name__} object has no attribute {name}" ) def __setattr__(self, name, value): if name in self.column_names: d = self.asdict() d[name] = value self.set_columns(**d) else: object.__setattr__(self, name, value) def _make_row(self, *args): return self.row_class(*args) def __getitem__(self, index): """ If passed an integer, return the specified row of this table, decoding metadata if it is present. Supports negative indexing, e.g. ``table[-5]``. If passed a slice, iterable or array return a new table containing the specified rows. Similar to numpy fancy indexing, if the array or iterables contains booleans then the index acts as a mask, returning those rows for which the mask is True. Note that as the result is a new table, the row ids will change as tskit row ids are row indexes. :param index: the index of a desired row, a slice of the desired rows, an iterable or array of the desired row numbers, or a boolean array to use as a mask. """ if isinstance(index, numbers.Integral): # Single row by integer if index < 0: index += len(self) if index < 0 or index >= len(self): raise IndexError("Index out of bounds") return self._make_row(*self.ll_table.get_row(index)) elif isinstance(index, numbers.Number): raise TypeError("Index must be integer, slice or iterable") elif isinstance(index, slice): index = range(*index.indices(len(self))) else: index = np.asarray(index) if index.dtype == np.bool_: if len(index) != len(self): raise IndexError("Boolean index must be same length as table") index = np.flatnonzero(index) index = util.safe_np_int_cast(index, np.int32) ret = self.__class__() ret.metadata_schema = self.metadata_schema ret.ll_table.extend(self.ll_table, row_indexes=index) return ret def __setitem__(self, index, new_row): """ Replaces a row of this table at the specified index with information from a row-like object. 
        Metadata will be validated and encoded according to the table's
        :attr:`metadata_schema<tskit.IndividualTable.metadata_schema>`.

        :param index: the index of the row to change
        :param row-like new_row: An object that has attributes corresponding to the
            properties of the new row. Both the objects returned from ``table[i]``
            and e.g. ``ts.individual(i)`` work for this purpose, along with any
            other object with the correct attributes.
        """
        if isinstance(index, numbers.Integral):
            # Single row by integer
            if index < 0:
                index += len(self)
            if index < 0 or index >= len(self):
                raise IndexError("Index out of bounds")
        else:
            raise TypeError("Index must be integer")

        row_data = {
            column: getattr(new_row, column)
            for column in self.column_names
            if "_offset" not in column
        }

        # Encode the metadata - note that if this becomes a perf bottleneck it is
        # possible to use the cached, encoded metadata in the row object, rather
        # than decode and reencode
        if "metadata" in row_data:
            row_data["metadata"] = self.metadata_schema.validate_and_encode_row(
                row_data["metadata"]
            )

        self.ll_table.update_row(row_index=index, **row_data)

    def append(self, row):
        """
        Adds a new row to this table and returns the ID of the new row. Metadata, if
        specified, will be validated and encoded according to the table's
        :attr:`metadata_schema<tskit.IndividualTable.metadata_schema>`.

        :param row-like row: An object that has attributes corresponding to the
            properties of the new row. Both the objects returned from ``table[i]``
            and e.g. ``ts.individual(i)`` work for this purpose, along with any
            other object with the correct attributes.
        :return: The index of the newly added row.
        :rtype: int
        """
        return self.add_row(
            **{
                column: getattr(row, column)
                for column in self.column_names
                if "_offset" not in column
            }
        )

    def replace_with(self, other):
        # Overwrite the contents of this table with a copy of the other table
        self.set_columns(**other.asdict())

    def clear(self):
        """
        Deletes all rows in this table.
        """
        self.ll_table.clear()

    def reset(self):
        # Deprecated alias for clear
        self.clear()

    def truncate(self, num_rows):
        """
        Truncates this table so that only the first ``num_rows`` are retained.

        :param int num_rows: The number of rows to retain in this table.
        """
        return self.ll_table.truncate(num_rows)

    def keep_rows(self, keep):
        """
        .. include:: substitutions/table_keep_rows_main.rst

        :param array-like keep: The rows to keep as a boolean array. Must
            be the same length as the table, and convertible to a numpy
            array of dtype bool.
        :return: The mapping between old and new row IDs as a numpy
            array (dtype int32).
        :rtype: numpy.ndarray (dtype=np.int32)
        """
        # We do this check here rather than in the C code because calling
        # len() on the input will cause a more readable exception to be
        # raised than the inscrutable errors we get from numpy when
        # converting arguments of the wrong type.
        if len(keep) != len(self):
            msg = (
                "Argument for keep_rows must be a boolean array of "
                "the same length as the table. "
                f"(need:{len(self)}, got:{len(keep)})"
            )
            raise ValueError(msg)
        return self.ll_table.keep_rows(keep)

    # Pickle support
    def __getstate__(self):
        return self.asdict()

    # Unpickle support
    def __setstate__(self, state):
        self.__init__()
        self.set_columns(**state)

    def copy(self):
        """
        Returns a deep copy of this table
        """
        copy = self.__class__()
        copy.set_columns(**self.asdict())
        return copy

    def asdict(self):
        """
        Returns a dictionary mapping the names of the columns in this table
        to the corresponding numpy arrays.
""" ret = {col: getattr(self, col) for col in self.column_names} # Not all tables have metadata try: ret["metadata_schema"] = repr(self.metadata_schema) except AttributeError: pass return ret def set_columns(self, **kwargs): """ Sets the values for each column in this :class:`Table` using values provided in numpy arrays. Overwrites existing data in all the table columns. """ raise NotImplementedError() def __str__(self): headers, rows = self._text_header_and_rows( limit=tskit._print_options["max_lines"] ) return util.unicode_table(rows, header=headers, row_separator=False) def _repr_html_(self): """ Called e.g. by jupyter notebooks to render tables """ headers, rows = self._text_header_and_rows( limit=tskit._print_options["max_lines"] ) return util.html_table(rows, header=headers) def _columns_all_integer(self, *colnames): # For displaying floating point values without loads of decimal places return all( np.all(getattr(self, col) == np.floor(getattr(self, col))) for col in colnames ) class MetadataTable(BaseTable): """ Base class for tables that have a metadata column. """ # TODO this class has some overlap with the MetadataProvider base class # and also the TreeSequence class. These all have methods to deal with # schemas and essentially do the same thing (provide a facade for the # low-level get/set metadata schemas functionality). We should refactor # this so we're only doing it in one place. # https://github.com/tskit-dev/tskit/issues/1957 def __init__(self, ll_table, row_class): super().__init__(ll_table, row_class) def _make_row(self, *args): return self.row_class(*args, metadata_decoder=self.metadata_schema.decode_row) def packset_metadata(self, metadatas): """ Packs the specified list of metadata values and updates the ``metadata`` and ``metadata_offset`` columns. The length of the metadatas array must be equal to the number of rows in the table. :param list metadatas: A list of metadata bytes values. """ packed, offset = util.pack_bytes(metadatas) d = self.asdict() d["metadata"] = packed d["metadata_offset"] = offset self.set_columns(**d) @property def metadata_schema(self) -> metadata.MetadataSchema: """ The :class:`tskit.MetadataSchema` for this table. """ # This isn't as inefficient as it looks because we're using an LRU cache on # the parse_metadata_schema function. Thus, we're really only incurring the # cost of creating the unicode string from the low-level schema and looking # up the functools cache. return metadata.parse_metadata_schema(self.ll_table.metadata_schema) @metadata_schema.setter def metadata_schema(self, schema: metadata.MetadataSchema) -> None: if not isinstance(schema, metadata.MetadataSchema): raise TypeError( "Only instances of tskit.MetadataSchema can be assigned to " f"metadata_schema, not {type(schema)}" ) self.ll_table.metadata_schema = repr(schema) def metadata_vector(self, key, *, dtype=None, default_value=NOTSET): """ Returns a numpy array of metadata values obtained by extracting ``key`` from each metadata entry, and using ``default_value`` if the key is not present. ``key`` may be a list, in which case nested values are returned. For instance, ``key = ["a", "x"]`` will return an array of ``row.metadata["a"]["x"]`` values, iterated over rows in this table. :param str key: The name, or a list of names, of metadata entries. :param str dtype: The dtype of the result (can usually be omitted). :param object default_value: The value to be inserted if the metadata key is not present. 
            Note that for numeric columns, a default value of None will
            result in a non-numeric array. The default behaviour is to raise
            ``KeyError`` on missing entries.
        """
        if default_value == NOTSET:

            def getter(d, k):
                return d[k]

        else:

            def getter(d, k):
                return (
                    d.get(k, default_value)
                    if isinstance(d, Mapping)
                    else default_value
                )

        if isinstance(key, list):
            out = np.array(
                [
                    reduce(
                        getter,
                        key,
                        row.metadata,
                    )
                    for row in self
                ],
                dtype=dtype,
            )
        else:
            out = np.array(
                [getter(row.metadata, key) for row in self],
                dtype=dtype,
            )
        return out
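
# Illustrative sketch (not part of the module): given a table whose rows carry
# JSON dict metadata such as {"x": 1.0}, metadata_vector extracts one key per
# row, substituting a default where the key is absent:
#
#   x = tables.individuals.metadata_vector("x", default_value=np.nan)
#   nested = tables.individuals.metadata_vector(["a", "x"])  # metadata["a"]["x"]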

class IndividualTable(MetadataTable):
    """
    A table defining the individuals in a tree sequence. Note that although
    each Individual has associated nodes, reference to these is not stored in
    the individual table, but rather reference to the individual is stored for
    each node in the :class:`NodeTable`. This is similar to the way in which
    the relationship between sites and mutations is modelled.

    .. include:: substitutions/table_edit_warning.rst

    :ivar flags: The array of flags values.
    :vartype flags: numpy.ndarray, dtype=np.uint32
    :ivar location: The flattened array of floating point location values. See
        :ref:`sec_encoding_ragged_columns` for more details.
    :vartype location: numpy.ndarray, dtype=np.float64
    :ivar location_offset: The array of offsets into the location column. See
        :ref:`sec_encoding_ragged_columns` for more details.
    :vartype location_offset: numpy.ndarray, dtype=np.uint32
    :ivar parents: The flattened array of parent individual ids. See
        :ref:`sec_encoding_ragged_columns` for more details.
    :vartype parents: numpy.ndarray, dtype=np.int32
    :ivar parents_offset: The array of offsets into the parents column. See
        :ref:`sec_encoding_ragged_columns` for more details.
    :vartype parents_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = [
        "flags",
        "location",
        "location_offset",
        "parents",
        "parents_offset",
        "metadata",
        "metadata_offset",
    ]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.IndividualTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, IndividualTableRow)

    def _text_header_and_rows(self, limit=None):
        headers = ("id", "flags", "location", "parents", "metadata")
        rows = []
        row_indexes = util.truncate_rows(self.num_rows, limit)
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{self.num_rows-limit}")
            else:
                row = self[j]
                location_str = ", ".join(map(str, row.location))
                parents_str = ", ".join(map(str, row.parents))
                rows.append(
                    "{}\t{}\t{}\t{}\t{}".format(
                        j,
                        row.flags,
                        location_str,
                        parents_str,
                        util.render_metadata(row.metadata),
                    ).split("\t")
                )
        return headers, rows

    def add_row(self, flags=0, location=None, parents=None, metadata=None):
        """
        Adds a new row to this :class:`IndividualTable` and returns the ID of the
        corresponding individual. Metadata, if specified, will be validated and
        encoded according to the table's
        :attr:`metadata_schema<tskit.IndividualTable.metadata_schema>`.

        :param int flags: The bitwise flags for the new individual.
        :param array-like location: A list of numeric values or one-dimensional
            numpy array describing the location of this individual. If not
            specified or None, a zero-dimensional location is stored.
        :param array-like parents: A list or array of ids of parent individuals.
            If not specified an empty array is stored.
        :param object metadata: Any object that is valid metadata for the table's
            schema. Defaults to the default metadata value for the table's schema.
            This is typically ``{}``. For no schema, ``None``.
        :return: The ID of the newly added individual.
        :rtype: int
        """
        if metadata is None:
            metadata = self.metadata_schema.empty_value
        metadata = self.metadata_schema.validate_and_encode_row(metadata)
        return self.ll_table.add_row(
            flags=flags, location=location, parents=parents, metadata=metadata
        )
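
    # Illustrative sketch (not part of the module): building a table row by row.
    # Assumes a permissive JSON schema so that dict metadata validates:
    #
    #   t = tskit.IndividualTable()
    #   t.metadata_schema = tskit.MetadataSchema.permissive_json()
    #   i = t.add_row(location=[0.5, 1.5], parents=[-1, -1], metadata={"name": "a"})
    #   t[i].metadata  # -> {"name": "a"}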

    def set_columns(
        self,
        flags=None,
        location=None,
        location_offset=None,
        parents=None,
        parents_offset=None,
        metadata=None,
        metadata_offset=None,
        metadata_schema=None,
    ):
        """
        Sets the values for each column in this :class:`IndividualTable` using the
        values in the specified arrays. Overwrites existing data in all the table
        columns.

        The ``flags`` array is mandatory and defines the number of
        individuals the table will contain.
        The ``location`` and ``location_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns`.
        The ``parents`` and ``parents_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns`.
        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param flags: The bitwise flags for each individual. Required.
        :type flags: numpy.ndarray, dtype=np.uint32
        :param location: The flattened location array. Must be specified along
            with ``location_offset``. If not specified or None, an empty location
            value is stored for each individual.
        :type location: numpy.ndarray, dtype=np.float64
        :param location_offset: The offsets into the ``location`` array.
        :type location_offset: numpy.ndarray, dtype=np.uint32.
        :param parents: The flattened parents array. Must be specified along
            with ``parents_offset``. If not specified or None, an empty parents
            array is stored for each individual.
        :type parents: numpy.ndarray, dtype=np.int32
        :param parents_offset: The offsets into the ``parents`` array.
        :type parents_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each individual.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need to
            be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self._check_required_args(flags=flags)
        self.ll_table.set_columns(
            dict(
                flags=flags,
                location=location,
                location_offset=location_offset,
                parents=parents,
                parents_offset=parents_offset,
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )

    def append_columns(
        self,
        flags=None,
        location=None,
        location_offset=None,
        parents=None,
        parents_offset=None,
        metadata=None,
        metadata_offset=None,
    ):
        """
        Appends the specified arrays to the end of the columns in this
        :class:`IndividualTable`. This allows many new rows to be added at once.

        The ``flags`` array is mandatory and defines the number of
        extra individuals to add to the table.
        The ``parents`` and ``parents_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns`.
        The ``location`` and ``location_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns`.
        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param flags: The bitwise flags for each individual. Required.
        :type flags: numpy.ndarray, dtype=np.uint32
        :param location: The flattened location array. Must be specified along
            with ``location_offset``. If not specified or None, an empty location
            value is stored for each individual.
        :type location: numpy.ndarray, dtype=np.float64
        :param location_offset: The offsets into the ``location`` array.
        :type location_offset: numpy.ndarray, dtype=np.uint32.
        :param parents: The flattened parents array. Must be specified along
            with ``parents_offset``. If not specified or None, an empty parents
            array is stored for each individual.
        :type parents: numpy.ndarray, dtype=np.int32
        :param parents_offset: The offsets into the ``parents`` array.
        :type parents_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each individual.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self._check_required_args(flags=flags)
        self.ll_table.append_columns(
            dict(
                flags=flags,
                location=location,
                location_offset=location_offset,
                parents=parents,
                parents_offset=parents_offset,
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        )

    def packset_location(self, locations):
        """
        Packs the specified list of location values and updates the
        ``location`` and ``location_offset`` columns. The length
        of the locations array must be equal to the number of rows in
        the table.

        :param list locations: A list of locations interpreted as numpy float64
            arrays.
        """
        packed, offset = util.pack_arrays(locations)
        d = self.asdict()
        d["location"] = packed
        d["location_offset"] = offset
        self.set_columns(**d)

    def packset_parents(self, parents):
        """
        Packs the specified list of parent values and updates the
        ``parents`` and ``parents_offset`` columns. The length
        of the parents array must be equal to the number of rows in
        the table.

        :param list parents: A list of lists of parent ids, interpreted as numpy
            int32 arrays.
        """
        packed, offset = util.pack_arrays(parents, np.int32)
        d = self.asdict()
        d["parents"] = packed
        d["parents_offset"] = offset
        self.set_columns(**d)
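
    # Illustrative sketch (not part of the module): bulk-setting the ragged
    # parents column for a three-row table in one call:
    #
    #   t.packset_parents([[0, 1], [], [0]])
    #   # t.parents        -> array([0, 1, 0], dtype=int32)
    #   # t.parents_offset -> array([0, 2, 2, 3], dtype=uint32)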

    def keep_rows(self, keep):
        """
        .. include:: substitutions/table_keep_rows_main.rst

        The values in the ``parents`` column are updated according to this
        map, so that reference integrity within the table is maintained.
        As a consequence of this, the values in the ``parents`` column
        for kept rows are bounds-checked and an error raised if they
        are not valid. Rows that are deleted are not checked for
        parent ID integrity.

        If an attempt is made to delete rows that are referred to by
        the ``parents`` column of rows that are retained, an error
        is raised.

        These error conditions are checked before any alterations to
        the table are made.

        :param array-like keep: The rows to keep as a boolean array. Must
            be the same length as the table, and convertible to a numpy
            array of dtype bool.
        :return: The mapping between old and new row IDs as a numpy
            array (dtype int32).
        :rtype: numpy.ndarray (dtype=np.int32)
        """
        return super().keep_rows(keep)
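
# Illustrative sketch (not part of the module): dropping rows while preserving
# reference integrity in the parents column. For a three-row table, deleting
# row 1 remaps the surviving references:
#
#   id_map = tables.individuals.keep_rows([True, False, True])
#   # id_map -> array([0, -1, 1], dtype=int32); the parents columns of the
#   # kept rows now refer to the new row IDs.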

class NodeTable(MetadataTable):
    """
    A table defining the nodes in a tree sequence. See the
    :ref:`definitions <sec_node_table_definition>` for details on the columns
    in this table and the
    :ref:`tree sequence requirements <sec_valid_tree_sequence_requirements>` section
    for the properties needed for a node table to be a part of a valid tree
    sequence.

    .. include:: substitutions/table_edit_warning.rst

    :ivar time: The array of time values.
    :vartype time: numpy.ndarray, dtype=np.float64
    :ivar flags: The array of flags values.
    :vartype flags: numpy.ndarray, dtype=np.uint32
    :ivar population: The array of population IDs.
    :vartype population: numpy.ndarray, dtype=np.int32
    :ivar individual: The array of individual IDs that each node belongs to.
    :vartype individual: numpy.ndarray, dtype=np.int32
    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = [
        "time",
        "flags",
        "population",
        "individual",
        "metadata",
        "metadata_offset",
    ]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.NodeTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, NodeTableRow)

    def _text_header_and_rows(self, limit=None):
        headers = ("id", "flags", "population", "individual", "time", "metadata")
        rows = []
        row_indexes = util.truncate_rows(self.num_rows, limit)
        decimal_places_times = 0 if self._columns_all_integer("time") else 8
        for j in row_indexes:
            row = self[j]
            if j == -1:
                rows.append(f"__skipped__{self.num_rows-limit}")
            else:
                rows.append(
                    "{}\t{}\t{}\t{}\t{:.{dp}f}\t{}".format(
                        j,
                        row.flags,
                        row.population,
                        row.individual,
                        row.time,
                        util.render_metadata(row.metadata),
                        dp=decimal_places_times,
                    ).split("\t")
                )
        return headers, rows

    def add_row(self, flags=0, time=0, population=-1, individual=-1, metadata=None):
        """
        Adds a new row to this :class:`NodeTable` and returns the ID of the
        corresponding node. Metadata, if specified, will be validated and encoded
        according to the table's
        :attr:`metadata_schema<tskit.NodeTable.metadata_schema>`.

        :param int flags: The bitwise flags for the new node.
        :param float time: The birth time for the new node.
        :param int population: The ID of the population in which the new node
            was born. Defaults to :data:`tskit.NULL`.
        :param int individual: The ID of the individual in which the new node
            was born. Defaults to :data:`tskit.NULL`.
        :param object metadata: Any object that is valid metadata for the table's
            schema. Defaults to the default metadata value for the table's schema.
            This is typically ``{}``. For no schema, ``None``.
        :return: The ID of the newly added node.
        :rtype: int
        """
        if metadata is None:
            metadata = self.metadata_schema.empty_value
        metadata = self.metadata_schema.validate_and_encode_row(metadata)
        return self.ll_table.add_row(flags, time, population, individual, metadata)

    def set_columns(
        self,
        flags=None,
        time=None,
        population=None,
        individual=None,
        metadata=None,
        metadata_offset=None,
        metadata_schema=None,
    ):
        """
        Sets the values for each column in this :class:`NodeTable` using the
        values in the specified arrays. Overwrites existing data in all the
        table columns.

        The ``flags``, ``time`` and ``population`` arrays must all be of the same
        length, which is equal to the number of nodes the table will contain. The
        ``metadata`` and ``metadata_offset`` parameters must be supplied together,
        and meet the requirements for :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param flags: The bitwise flags for each node. Required.
        :type flags: numpy.ndarray, dtype=np.uint32
        :param time: The time values for each node. Required.
        :type time: numpy.ndarray, dtype=np.float64
        :param population: The population values for each node. If not specified
            or None, the :data:`tskit.NULL` value is stored for each node.
        :type population: numpy.ndarray, dtype=np.int32
        :param individual: The individual values for each node. If not specified
            or None, the :data:`tskit.NULL` value is stored for each node.
        :type individual: numpy.ndarray, dtype=np.int32
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each node.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need to
            be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self._check_required_args(flags=flags, time=time)
        self.ll_table.set_columns(
            dict(
                flags=flags,
                time=time,
                population=population,
                individual=individual,
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )

    def append_columns(
        self,
        flags=None,
        time=None,
        population=None,
        individual=None,
        metadata=None,
        metadata_offset=None,
    ):
        """
        Appends the specified arrays to the end of the columns in this
        :class:`NodeTable`. This allows many new rows to be added at once.

        The ``flags``, ``time`` and ``population`` arrays must all be of the same
        length, which is equal to the number of nodes that will be added to the
        table. The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param flags: The bitwise flags for each node. Required.
        :type flags: numpy.ndarray, dtype=np.uint32
        :param time: The time values for each node. Required.
        :type time: numpy.ndarray, dtype=np.float64
        :param population: The population values for each node. If not specified
            or None, the :data:`tskit.NULL` value is stored for each node.
        :type population: numpy.ndarray, dtype=np.int32
        :param individual: The individual values for each node. If not specified
            or None, the :data:`tskit.NULL` value is stored for each node.
        :type individual: numpy.ndarray, dtype=np.int32
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each node.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self._check_required_args(flags=flags, time=time)
        self.ll_table.append_columns(
            dict(
                flags=flags,
                time=time,
                population=population,
                individual=individual,
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        )

class EdgeTable(MetadataTable):
    """
    A table defining the edges in a tree sequence. See the
    :ref:`definitions <sec_edge_table_definition>` for details on the columns
    in this table and the
    :ref:`tree sequence requirements <sec_valid_tree_sequence_requirements>` section
    for the properties needed for an edge table to be a part of a valid tree
    sequence.

    .. include:: substitutions/table_edit_warning.rst

    :ivar left: The array of left coordinates.
    :vartype left: numpy.ndarray, dtype=np.float64
    :ivar right: The array of right coordinates.
    :vartype right: numpy.ndarray, dtype=np.float64
    :ivar parent: The array of parent node IDs.
    :vartype parent: numpy.ndarray, dtype=np.int32
    :ivar child: The array of child node IDs.
    :vartype child: numpy.ndarray, dtype=np.int32
    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = [
        "left",
        "right",
        "parent",
        "child",
        "metadata",
        "metadata_offset",
    ]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.EdgeTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, EdgeTableRow)

    def _text_header_and_rows(self, limit=None):
        headers = ("id", "left", "right", "parent", "child", "metadata")
        rows = []
        row_indexes = util.truncate_rows(self.num_rows, limit)
        decimal_places = 0 if self._columns_all_integer("left", "right") else 8
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{self.num_rows-limit}")
            else:
                row = self[j]
                rows.append(
                    "{}\t{:.{dp}f}\t{:.{dp}f}\t{}\t{}\t{}".format(
                        j,
                        row.left,
                        row.right,
                        row.parent,
                        row.child,
                        util.render_metadata(row.metadata),
                        dp=decimal_places,
                    ).split("\t")
                )
        return headers, rows

    def add_row(self, left, right, parent, child, metadata=None):
        """
        Adds a new row to this :class:`EdgeTable` and returns the ID of the
        corresponding edge. Metadata, if specified, will be validated and encoded
        according to the table's
        :attr:`metadata_schema<tskit.EdgeTable.metadata_schema>`.

        :param float left: The left coordinate (inclusive).
        :param float right: The right coordinate (exclusive).
        :param int parent: The ID of the parent node.
        :param int child: The ID of the child node.
        :param object metadata: Any object that is valid metadata for the table's
            schema. Defaults to the default metadata value for the table's schema.
            This is typically ``{}``. For no schema, ``None``.
        :return: The ID of the newly added edge.
        :rtype: int
        """
        if metadata is None:
            metadata = self.metadata_schema.empty_value
        metadata = self.metadata_schema.validate_and_encode_row(metadata)
        return self.ll_table.add_row(left, right, parent, child, metadata)

    def set_columns(
        self,
        left=None,
        right=None,
        parent=None,
        child=None,
        metadata=None,
        metadata_offset=None,
        metadata_schema=None,
    ):
        """
        Sets the values for each column in this :class:`EdgeTable` using the values
        in the specified arrays. Overwrites existing data in all the table columns.

        The ``left``, ``right``, ``parent`` and ``child`` parameters are mandatory,
        and must be numpy arrays of the same length (which is equal to the number of
        edges the table will contain).
        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param left: The left coordinates (inclusive).
        :type left: numpy.ndarray, dtype=np.float64
        :param right: The right coordinates (exclusive).
        :type right: numpy.ndarray, dtype=np.float64
        :param parent: The parent node IDs.
        :type parent: numpy.ndarray, dtype=np.int32
        :param child: The child node IDs.
        :type child: numpy.ndarray, dtype=np.int32
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each edge.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need to
            be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self._check_required_args(left=left, right=right, parent=parent, child=child)
        self.ll_table.set_columns(
            dict(
                left=left,
                right=right,
                parent=parent,
                child=child,
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )

    def append_columns(
        self, left, right, parent, child, metadata=None, metadata_offset=None
    ):
        """
        Appends the specified arrays to the end of the columns of this
        :class:`EdgeTable`. This allows many new rows to be added at once.

        The ``left``, ``right``, ``parent`` and ``child`` parameters are mandatory,
        and must be numpy arrays of the same length (which is equal to the number of
        additional edges to add to the table).
        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param left: The left coordinates (inclusive).
        :type left: numpy.ndarray, dtype=np.float64
        :param right: The right coordinates (exclusive).
        :type right: numpy.ndarray, dtype=np.float64
        :param parent: The parent node IDs.
        :type parent: numpy.ndarray, dtype=np.int32
        :param child: The child node IDs.
        :type child: numpy.ndarray, dtype=np.int32
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each edge.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.append_columns(
            dict(
                left=left,
                right=right,
                parent=parent,
                child=child,
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        )

    def squash(self):
        """
        Sorts, then condenses the table into the smallest possible number of rows by
        combining any adjacent edges.
        A pair of edges is said to be `adjacent` if they have the same parent and
        child nodes, and if the left coordinate of one of the edges is equal to the
        right coordinate of the other edge.
        The ``squash`` method modifies an :class:`EdgeTable` in place so that any
        set of adjacent edges is replaced by a single edge.
        The new edge will have the same parent and child node, a left coordinate
        equal to the smallest left coordinate in the set, and a right coordinate
        equal to the largest right coordinate in the set.
        The new edge table will be sorted in the order (P, C, L, R): if the node
        table is ordered by increasing node time, as is common, this order will
        meet the :ref:`sec_edge_requirements` for a valid tree sequence, otherwise
        you will need to call :meth:`.sort` on the entire
        :class:`TableCollection`.

        .. note::
            Note that this method will fail if any edges have non-empty metadata.
        """
        self.ll_table.squash()
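
# Illustrative sketch (not part of the module): two adjacent edges with the
# same parent/child are merged into one spanning interval by squash():
#
#   edges = tskit.EdgeTable()
#   edges.add_row(left=0, right=5, parent=3, child=0)
#   edges.add_row(left=5, right=10, parent=3, child=0)
#   edges.squash()
#   # len(edges) -> 1, covering [0, 10) with parent=3, child=0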

class MigrationTable(MetadataTable):
    """
    A table defining the migrations in a tree sequence. See the
    :ref:`definitions <sec_migration_table_definition>` for details on the columns
    in this table and the
    :ref:`tree sequence requirements <sec_valid_tree_sequence_requirements>` section
    for the properties needed for a migration table to be a part of a valid tree
    sequence.

    .. include:: substitutions/table_edit_warning.rst

    :ivar left: The array of left coordinates.
    :vartype left: numpy.ndarray, dtype=np.float64
    :ivar right: The array of right coordinates.
    :vartype right: numpy.ndarray, dtype=np.float64
    :ivar node: The array of node IDs.
    :vartype node: numpy.ndarray, dtype=np.int32
    :ivar source: The array of source population IDs.
    :vartype source: numpy.ndarray, dtype=np.int32
    :ivar dest: The array of destination population IDs.
    :vartype dest: numpy.ndarray, dtype=np.int32
    :ivar time: The array of time values.
    :vartype time: numpy.ndarray, dtype=np.float64
    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = [
        "left",
        "right",
        "node",
        "source",
        "dest",
        "time",
        "metadata",
        "metadata_offset",
    ]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.MigrationTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, MigrationTableRow)

    def _text_header_and_rows(self, limit=None):
        headers = ("id", "left", "right", "node", "source", "dest", "time", "metadata")
        rows = []
        row_indexes = util.truncate_rows(self.num_rows, limit)
        decimal_places_coords = 0 if self._columns_all_integer("left", "right") else 8
        decimal_places_times = 0 if self._columns_all_integer("time") else 8
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{self.num_rows-limit}")
            else:
                row = self[j]
                rows.append(
                    "{}\t{:.{dp_c}f}\t{:.{dp_c}f}\t{}\t{}\t{}\t{:.{dp_t}f}\t{}".format(
                        j,
                        row.left,
                        row.right,
                        row.node,
                        row.source,
                        row.dest,
                        row.time,
                        util.render_metadata(row.metadata),
                        dp_c=decimal_places_coords,
                        dp_t=decimal_places_times,
                    ).split("\t")
                )
        return headers, rows

    def add_row(self, left, right, node, source, dest, time, metadata=None):
        """
        Adds a new row to this :class:`MigrationTable` and returns the ID of the
        corresponding migration. Metadata, if specified, will be validated and
        encoded according to the table's
        :attr:`metadata_schema<tskit.MigrationTable.metadata_schema>`.

        :param float left: The left coordinate (inclusive).
        :param float right: The right coordinate (exclusive).
        :param int node: The node ID.
        :param int source: The ID of the source population.
        :param int dest: The ID of the destination population.
        :param float time: The time of the migration event.
        :param object metadata: Any object that is valid metadata for the table's
            schema. Defaults to the default metadata value for the table's schema.
            This is typically ``{}``. For no schema, ``None``.
        :return: The ID of the newly added migration.
        :rtype: int
        """
        if metadata is None:
            metadata = self.metadata_schema.empty_value
        metadata = self.metadata_schema.validate_and_encode_row(metadata)
        return self.ll_table.add_row(left, right, node, source, dest, time, metadata)

    def set_columns(
        self,
        left=None,
        right=None,
        node=None,
        source=None,
        dest=None,
        time=None,
        metadata=None,
        metadata_offset=None,
        metadata_schema=None,
    ):
        """
        Sets the values for each column in this :class:`MigrationTable` using the
        values in the specified arrays. Overwrites existing data in all the table
        columns.

        All parameters except ``metadata`` and ``metadata_offset`` are mandatory,
        and must be numpy arrays of the same length (which is equal to the number
        of migrations the table will contain).
        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param left: The left coordinates (inclusive).
        :type left: numpy.ndarray, dtype=np.float64
        :param right: The right coordinates (exclusive).
        :type right: numpy.ndarray, dtype=np.float64
        :param node: The node IDs.
        :type node: numpy.ndarray, dtype=np.int32
        :param source: The source population IDs.
        :type source: numpy.ndarray, dtype=np.int32
        :param dest: The destination population IDs.
        :type dest: numpy.ndarray, dtype=np.int32
        :param time: The time of each migration.
        :type time: numpy.ndarray, dtype=np.float64
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each migration.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need to
            be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self._check_required_args(
            left=left, right=right, node=node, source=source, dest=dest, time=time
        )
        self.ll_table.set_columns(
            dict(
                left=left,
                right=right,
                node=node,
                source=source,
                dest=dest,
                time=time,
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )

    def append_columns(
        self,
        left,
        right,
        node,
        source,
        dest,
        time,
        metadata=None,
        metadata_offset=None,
    ):
        """
        Appends the specified arrays to the end of the columns of this
        :class:`MigrationTable`. This allows many new rows to be added at once.

        All parameters except ``metadata`` and ``metadata_offset`` are mandatory,
        and must be numpy arrays of the same length (which is equal to the number
        of additional migrations to add to the table).
        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for :ref:`sec_encoding_ragged_columns`.
        See :ref:`sec_tables_api_binary_columns` for more information and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param left: The left coordinates (inclusive).
        :type left: numpy.ndarray, dtype=np.float64
        :param right: The right coordinates (exclusive).
        :type right: numpy.ndarray, dtype=np.float64
        :param node: The node IDs.
        :type node: numpy.ndarray, dtype=np.int32
        :param source: The source population IDs.
        :type source: numpy.ndarray, dtype=np.int32
        :param dest: The destination population IDs.
        :type dest: numpy.ndarray, dtype=np.int32
        :param time: The time of each migration.
        :type time: numpy.ndarray, dtype=np.float64
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each migration.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.append_columns(
            dict(
                left=left,
                right=right,
                node=node,
                source=source,
                dest=dest,
                time=time,
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        )

class SiteTable(MetadataTable):
    """
    A table defining the sites in a tree sequence. See the
    :ref:`definitions <sec_site_table_definition>` for details on the columns
    in this table and the
    :ref:`tree sequence requirements <sec_valid_tree_sequence_requirements>` section
    for the properties needed for a site table to be a part of a valid tree
    sequence.

    .. include:: substitutions/table_edit_warning.rst

    :ivar position: The array of site position coordinates.
    :vartype position: numpy.ndarray, dtype=np.float64
    :ivar ancestral_state: The flattened array of ancestral state strings.
        See :ref:`sec_tables_api_text_columns` for more details.
    :vartype ancestral_state: numpy.ndarray, dtype=np.int8
    :ivar ancestral_state_offset: The offsets of rows in the ancestral_state
        array. See :ref:`sec_tables_api_text_columns` for more details.
    :vartype ancestral_state_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = [
        "position",
        "ancestral_state",
        "ancestral_state_offset",
        "metadata",
        "metadata_offset",
    ]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.SiteTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, SiteTableRow)

    def _text_header_and_rows(self, limit=None):
        headers = ("id", "position", "ancestral_state", "metadata")
        rows = []
        row_indexes = util.truncate_rows(self.num_rows, limit)
        decimal_places = 0 if self._columns_all_integer("position") else 8
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{self.num_rows-limit}")
            else:
                row = self[j]
                rows.append(
                    "{}\t{:.{dp}f}\t{}\t{}".format(
                        j,
                        row.position,
                        row.ancestral_state,
                        util.render_metadata(row.metadata),
                        dp=decimal_places,
                    ).split("\t")
                )
        return headers, rows

    def add_row(self, position, ancestral_state, metadata=None):
        """
        Adds a new row to this :class:`SiteTable` and returns the ID of the
        corresponding site. Metadata, if specified, will be validated and encoded
        according to the table's
        :attr:`metadata_schema<tskit.SiteTable.metadata_schema>`.

        :param float position: The position of this site in genome coordinates.
        :param str ancestral_state: The state of this site at the root of the tree.
        :param object metadata: Any object that is valid metadata for the table's
            schema. Defaults to the default metadata value for the table's schema.
            This is typically ``{}``. For no schema, ``None``.
        :return: The ID of the newly added site.
        :rtype: int
        """
        if metadata is None:
            metadata = self.metadata_schema.empty_value
        metadata = self.metadata_schema.validate_and_encode_row(metadata)
        return self.ll_table.add_row(position, ancestral_state, metadata)
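
    # Illustrative sketch (not part of the module): recording a biallelic site.
    # The returned ID is what MutationTable rows use in their ``site`` column:
    #
    #   site_id = tables.sites.add_row(position=100.0, ancestral_state="A")
    #   tables.mutations.add_row(site=site_id, node=0, derived_state="T")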

    def set_columns(
        self,
        position=None,
        ancestral_state=None,
        ancestral_state_offset=None,
        metadata=None,
        metadata_offset=None,
        metadata_schema=None,
    ):
        """
        Sets the values for each column in this :class:`SiteTable` using the values
        in the specified arrays. Overwrites existing data in all the table columns.

        The ``position``, ``ancestral_state`` and ``ancestral_state_offset``
        parameters are mandatory, and must be 1D numpy arrays. The length
        of the ``position`` array determines the number of rows in the table.
        The ``ancestral_state`` and ``ancestral_state_offset`` parameters must
        be supplied together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_text_columns` for more information). The
        ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information) and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare
        metadata.

        :param position: The position of each site in genome coordinates.
        :type position: numpy.ndarray, dtype=np.float64
        :param ancestral_state: The flattened ancestral_state array. Required.
        :type ancestral_state: numpy.ndarray, dtype=np.int8
        :param ancestral_state_offset: The offsets into the ``ancestral_state``
            array.
        :type ancestral_state_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each site.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need to
            be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self._check_required_args(
            position=position,
            ancestral_state=ancestral_state,
            ancestral_state_offset=ancestral_state_offset,
        )
        self.ll_table.set_columns(
            dict(
                position=position,
                ancestral_state=ancestral_state,
                ancestral_state_offset=ancestral_state_offset,
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )
[docs]    def append_columns(
        self,
        position,
        ancestral_state,
        ancestral_state_offset,
        metadata=None,
        metadata_offset=None,
    ):
        """
        Appends the specified arrays to the end of the columns of this
        :class:`SiteTable`. This allows many new rows to be added at once.

        The ``position``, ``ancestral_state`` and ``ancestral_state_offset``
        parameters are mandatory, and must be 1D numpy arrays. The length
        of the ``position`` array determines the number of additional rows
        to add to the table.
        The ``ancestral_state`` and ``ancestral_state_offset`` parameters must
        be supplied together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_text_columns` for more information). The
        ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information) and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare
        metadata.

        :param position: The position of each site in genome coordinates.
        :type position: numpy.ndarray, dtype=np.float64
        :param ancestral_state: The flattened ancestral_state array. Required.
        :type ancestral_state: numpy.ndarray, dtype=np.int8
        :param ancestral_state_offset: The offsets into the ``ancestral_state`` array.
        :type ancestral_state_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each site.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.append_columns(
            dict(
                position=position,
                ancestral_state=ancestral_state,
                ancestral_state_offset=ancestral_state_offset,
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        )
[docs] def packset_ancestral_state(self, ancestral_states): """ Packs the specified list of ancestral_state values and updates the ``ancestral_state`` and ``ancestral_state_offset`` columns. The length of the ancestral_states array must be equal to the number of rows in the table. :param list(str) ancestral_states: A list of string ancestral state values. """ packed, offset = util.pack_strings(ancestral_states) d = self.asdict() d["ancestral_state"] = packed d["ancestral_state_offset"] = offset self.set_columns(**d)
[docs]class MutationTable(MetadataTable):
    """
    A table defining the mutations in a tree sequence. See the
    :ref:`definitions <sec_mutation_table_definition>` for details on the columns
    in this table and the
    :ref:`tree sequence requirements <sec_valid_tree_sequence_requirements>` section
    for the properties needed for a mutation table to be a part of a valid tree
    sequence.

    .. include:: substitutions/table_edit_warning.rst

    :ivar site: The array of site IDs.
    :vartype site: numpy.ndarray, dtype=np.int32
    :ivar node: The array of node IDs.
    :vartype node: numpy.ndarray, dtype=np.int32
    :ivar time: The array of time values.
    :vartype time: numpy.ndarray, dtype=np.float64
    :ivar derived_state: The flattened array of derived state strings.
        See :ref:`sec_tables_api_text_columns` for more details.
    :vartype derived_state: numpy.ndarray, dtype=np.int8
    :ivar derived_state_offset: The offsets of rows in the derived_state array.
        See :ref:`sec_tables_api_text_columns` for more details.
    :vartype derived_state_offset: numpy.ndarray, dtype=np.uint32
    :ivar parent: The array of parent mutation IDs.
    :vartype parent: numpy.ndarray, dtype=np.int32
    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = [
        "site",
        "node",
        "time",
        "derived_state",
        "derived_state_offset",
        "parent",
        "metadata",
        "metadata_offset",
    ]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.MutationTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, MutationTableRow)

    def _text_header_and_rows(self, limit=None):
        headers = ("id", "site", "node", "time", "derived_state", "parent", "metadata")
        rows = []
        row_indexes = util.truncate_rows(self.num_rows, limit)
        # Currently mutations do not have discretised times: this check is
        # for consistency with the other tables.
        decimal_places_times = 0 if self._columns_all_integer("time") else 8
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{self.num_rows-limit}")
            else:
                row = self[j]
                rows.append(
                    "{}\t{}\t{}\t{:.{dp}f}\t{}\t{}\t{}".format(
                        j,
                        row.site,
                        row.node,
                        row.time,
                        row.derived_state,
                        row.parent,
                        util.render_metadata(row.metadata),
                        dp=decimal_places_times,
                    ).split("\t")
                )
        return headers, rows
[docs]    def add_row(self, site, node, derived_state, parent=-1, metadata=None, time=None):
        """
        Adds a new row to this :class:`MutationTable` and returns the ID of the
        corresponding mutation. Metadata, if specified, will be validated and
        encoded according to the table's
        :attr:`metadata_schema<tskit.MutationTable.metadata_schema>`.

        :param int site: The ID of the site that this mutation occurs at.
        :param int node: The ID of the first node inheriting this mutation.
        :param str derived_state: The state of the site at this mutation's node.
        :param int parent: The ID of the parent mutation. If not specified,
            defaults to :attr:`NULL`.
        :param object metadata: Any object that is valid metadata for the table's schema.
            Defaults to the default metadata value for the table's schema. This is
            typically ``{}``. For no schema, ``None``.
        :param float time: The occurrence time for the new mutation. If not specified,
            defaults to ``UNKNOWN_TIME``, indicating the time is unknown.
        :return: The ID of the newly added mutation.
        :rtype: int
        """
        if metadata is None:
            metadata = self.metadata_schema.empty_value
        metadata = self.metadata_schema.validate_and_encode_row(metadata)
        return self.ll_table.add_row(
            site,
            node,
            derived_state,
            parent,
            metadata,
            UNKNOWN_TIME if time is None else time,
        )
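    # -- Example (editor's illustrative sketch, not part of the library) --
    # Recording a mutation over a sample node at an existing site; the time
    # defaults to UNKNOWN_TIME when omitted.
    #
    #     import tskit
    #
    #     tables = tskit.TableCollection(sequence_length=10)
    #     node = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
    #     site = tables.sites.add_row(position=5, ancestral_state="A")
    #     mut = tables.mutations.add_row(site=site, node=node, derived_state="T")
    #     assert tskit.is_unknown_time(tables.mutations[mut].time)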
[docs]    def set_columns(
        self,
        site=None,
        node=None,
        time=None,
        derived_state=None,
        derived_state_offset=None,
        parent=None,
        metadata=None,
        metadata_offset=None,
        metadata_schema=None,
    ):
        """
        Sets the values for each column in this :class:`MutationTable` using the
        values in the specified arrays. Overwrites existing data in all the
        table columns.

        The ``site``, ``node``, ``derived_state`` and ``derived_state_offset``
        parameters are mandatory, and must be 1D numpy arrays. The
        ``site`` and ``node`` (also ``parent`` and ``time``, if supplied) arrays
        must be of equal length, and determine the number of rows in the table.
        The ``derived_state`` and ``derived_state_offset`` parameters must
        be supplied together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_text_columns` for more information). The
        ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information) and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare
        metadata.

        :param site: The ID of the site each mutation occurs at.
        :type site: numpy.ndarray, dtype=np.int32
        :param node: The ID of the node each mutation is associated with.
        :type node: numpy.ndarray, dtype=np.int32
        :param time: The time values for each mutation.
        :type time: numpy.ndarray, dtype=np.float64
        :param derived_state: The flattened derived_state array. Required.
        :type derived_state: numpy.ndarray, dtype=np.int8
        :param derived_state_offset: The offsets into the ``derived_state`` array.
        :type derived_state_offset: numpy.ndarray, dtype=np.uint32.
        :param parent: The ID of the parent mutation for each mutation.
        :type parent: numpy.ndarray, dtype=np.int32
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each mutation.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need
            to be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self._check_required_args(
            site=site,
            node=node,
            derived_state=derived_state,
            derived_state_offset=derived_state_offset,
        )
        self.ll_table.set_columns(
            dict(
                site=site,
                node=node,
                parent=parent,
                time=time,
                derived_state=derived_state,
                derived_state_offset=derived_state_offset,
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )
[docs]    def append_columns(
        self,
        site,
        node,
        derived_state,
        derived_state_offset,
        parent=None,
        time=None,
        metadata=None,
        metadata_offset=None,
    ):
        """
        Appends the specified arrays to the end of the columns of this
        :class:`MutationTable`. This allows many new rows to be added at once.

        The ``site``, ``node``, ``derived_state`` and ``derived_state_offset``
        parameters are mandatory, and must be 1D numpy arrays. The
        ``site`` and ``node`` (also ``time`` and ``parent``, if supplied) arrays
        must be of equal length, and determine the number of additional
        rows to add to the table.
        The ``derived_state`` and ``derived_state_offset`` parameters must
        be supplied together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_text_columns` for more information). The
        ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information) and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare
        metadata.

        :param site: The ID of the site each mutation occurs at.
        :type site: numpy.ndarray, dtype=np.int32
        :param node: The ID of the node each mutation is associated with.
        :type node: numpy.ndarray, dtype=np.int32
        :param time: The time values for each mutation.
        :type time: numpy.ndarray, dtype=np.float64
        :param derived_state: The flattened derived_state array. Required.
        :type derived_state: numpy.ndarray, dtype=np.int8
        :param derived_state_offset: The offsets into the ``derived_state`` array.
        :type derived_state_offset: numpy.ndarray, dtype=np.uint32.
        :param parent: The ID of the parent mutation for each mutation.
        :type parent: numpy.ndarray, dtype=np.int32
        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each mutation.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.append_columns(
            dict(
                site=site,
                node=node,
                time=time,
                parent=parent,
                derived_state=derived_state,
                derived_state_offset=derived_state_offset,
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        )
[docs] def packset_derived_state(self, derived_states): """ Packs the specified list of derived_state values and updates the ``derived_state`` and ``derived_state_offset`` columns. The length of the derived_states array must be equal to the number of rows in the table. :param list(str) derived_states: A list of string derived state values. """ packed, offset = util.pack_strings(derived_states) d = self.asdict() d["derived_state"] = packed d["derived_state_offset"] = offset self.set_columns(**d)
[docs] def keep_rows(self, keep): """ .. include:: substitutions/table_keep_rows_main.rst The values in the ``parent`` column are updated according to this map, so that reference integrity within the table is maintained. As a consequence of this, the values in the ``parent`` column for kept rows are bounds-checked and an error raised if they are not valid. Rows that are deleted are not checked for parent ID integrity. If an attempt is made to delete rows that are referred to by the ``parent`` column of rows that are retained, an error is raised. These error conditions are checked before any alterations to the table are made. :param array-like keep: The rows to keep as a boolean array. Must be the same length as the table, and convertible to a numpy array of dtype bool. :return: The mapping between old and new row IDs as a numpy array (dtype int32). :rtype: numpy.ndarray (dtype=np.int32) """ return super().keep_rows(keep)
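# -- Example (editor's illustrative sketch, not part of the library) --
# Dropping mutation rows with a boolean mask; the returned array maps old
# mutation IDs to new ones, with tskit.NULL marking deleted rows.
#
#     import numpy as np
#     import tskit
#
#     tables = tskit.TableCollection(sequence_length=10)
#     node = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#     site = tables.sites.add_row(position=5, ancestral_state="A")
#     for state in ["T", "G"]:
#         tables.mutations.add_row(site=site, node=node, derived_state=state)
#     id_map = tables.mutations.keep_rows(np.array([True, False]))
#     assert tables.mutations.num_rows == 1 and id_map[1] == tskit.NULL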
[docs]class PopulationTable(MetadataTable):
    """
    A table defining the populations referred to in a tree sequence. The
    PopulationTable stores metadata for populations that may be referred to in the
    NodeTable and MigrationTable. Note that although nodes
    may be associated with populations, this association is stored in
    the :class:`NodeTable`: only metadata on each population is stored
    in the population table.

    .. include:: substitutions/table_edit_warning.rst

    :ivar metadata: The flattened array of binary metadata values. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata: numpy.ndarray, dtype=np.int8
    :ivar metadata_offset: The array of offsets into the metadata column. See
        :ref:`sec_tables_api_binary_columns` for more details.
    :vartype metadata_offset: numpy.ndarray, dtype=np.uint32
    :ivar metadata_schema: The metadata schema for this table's metadata column
    :vartype metadata_schema: tskit.MetadataSchema
    """

    column_names = ["metadata", "metadata_offset"]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.PopulationTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, PopulationTableRow)
[docs] def add_row(self, metadata=None): """ Adds a new row to this :class:`PopulationTable` and returns the ID of the corresponding population. Metadata, if specified, will be validated and encoded according to the table's :attr:`metadata_schema<tskit.PopulationTable.metadata_schema>`. :param object metadata: Any object that is valid metadata for the table's schema. Defaults to the default metadata value for the table's schema. This is typically ``{}``. For no schema, ``None``. :return: The ID of the newly added population. :rtype: int """ if metadata is None: metadata = self.metadata_schema.empty_value metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(metadata=metadata)
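    # -- Example (editor's illustrative sketch, not part of the library;
    # assumes a tskit version that provides MetadataSchema.permissive_json) --
    # With a JSON metadata schema set, metadata is validated and encoded
    # automatically on add_row, and decoded again on access.
    #
    #     import tskit
    #
    #     tables = tskit.TableCollection(sequence_length=1)
    #     tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json()
    #     pop = tables.populations.add_row(metadata={"name": "CEU"})
    #     assert tables.populations[pop].metadata == {"name": "CEU"}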
def _text_header_and_rows(self, limit=None): headers = ("id", "metadata") rows = [] row_indexes = util.truncate_rows(self.num_rows, limit) for j in row_indexes: if j == -1: rows.append(f"__skipped__{self.num_rows-limit}") else: rows.append((str(j), util.render_metadata(self[j].metadata, length=70))) return headers, rows
[docs]    def set_columns(self, metadata=None, metadata_offset=None, metadata_schema=None):
        """
        Sets the values for each column in this :class:`PopulationTable` using the
        values in the specified arrays. Overwrites existing data in all the
        table columns.

        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information) and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each population.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        :param metadata_schema: The encoded metadata schema. If None (default)
            do not overwrite the existing schema. Note that a schema will need
            to be encoded as a string, e.g. via ``repr(new_metadata_schema)``.
        :type metadata_schema: str
        """
        self.ll_table.set_columns(
            dict(
                metadata=metadata,
                metadata_offset=metadata_offset,
                metadata_schema=metadata_schema,
            )
        )
[docs]    def append_columns(self, metadata=None, metadata_offset=None):
        """
        Appends the specified arrays to the end of the columns of this
        :class:`PopulationTable`. This allows many new rows to be added at once.

        The ``metadata`` and ``metadata_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information) and
        :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata.

        :param metadata: The flattened metadata array. Must be specified along
            with ``metadata_offset``. If not specified or None, an empty metadata
            value is stored for each population.
        :type metadata: numpy.ndarray, dtype=np.int8
        :param metadata_offset: The offsets into the ``metadata`` array.
        :type metadata_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.append_columns(
            dict(metadata=metadata, metadata_offset=metadata_offset)
        )
[docs]class ProvenanceTable(BaseTable):
    """
    A table recording the provenance (i.e., history) of this table, so that the
    origin of the underlying data and sequence of subsequent operations can be
    traced. Each row contains a "record" string (recommended format: JSON) and
    a timestamp.

    .. todo::
        The format of the `record` field will be more precisely specified in
        the future.

    :ivar record: The flattened array containing the record strings.
        See :ref:`sec_tables_api_text_columns` for more details.
    :vartype record: numpy.ndarray, dtype=np.int8
    :ivar record_offset: The array of offsets into the record column. See
        :ref:`sec_tables_api_text_columns` for more details.
    :vartype record_offset: numpy.ndarray, dtype=np.uint32
    :ivar timestamp: The flattened array containing the timestamp strings.
        See :ref:`sec_tables_api_text_columns` for more details.
    :vartype timestamp: numpy.ndarray, dtype=np.int8
    :ivar timestamp_offset: The array of offsets into the timestamp column. See
        :ref:`sec_tables_api_text_columns` for more details.
    :vartype timestamp_offset: numpy.ndarray, dtype=np.uint32
    """

    column_names = ["record", "record_offset", "timestamp", "timestamp_offset"]

    def __init__(self, max_rows_increment=0, ll_table=None):
        if ll_table is None:
            ll_table = _tskit.ProvenanceTable(max_rows_increment=max_rows_increment)
        super().__init__(ll_table, ProvenanceTableRow)
[docs] def equals(self, other, ignore_timestamps=False): """ Returns True if `self` and `other` are equal. By default, two provenance tables are considered equal if their columns are byte-for-byte identical. :param other: Another provenance table instance :param bool ignore_timestamps: If True exclude the timestamp column from the comparison. :return: True if other is equal to this provenance table; False otherwise. :rtype: bool """ ret = False if type(other) is type(self): ret = bool( self.ll_table.equals( other.ll_table, ignore_timestamps=ignore_timestamps ) ) return ret
[docs]    def assert_equals(self, other, *, ignore_timestamps=False):
        """
        Raise an AssertionError for the first found difference between
        this and another provenance table.

        :param other: Another provenance table instance
        :param bool ignore_timestamps: If True exclude the timestamp column
            from the comparison.
        """
        if type(other) is not type(self):
            raise AssertionError(f"Types differ: self={type(self)} other={type(other)}")

        # Check using the low-level method to avoid slowly going through everything
        if self.equals(other, ignore_timestamps=ignore_timestamps):
            return

        for n, (row_self, row_other) in enumerate(zip(self, other)):
            if ignore_timestamps:
                row_self = dataclasses.replace(row_self, timestamp=None)
                row_other = dataclasses.replace(row_other, timestamp=None)
            if row_self != row_other:
                self_dict = dataclasses.asdict(self[n])
                other_dict = dataclasses.asdict(other[n])
                diff_string = []
                for col in self_dict.keys():
                    if self_dict[col] != other_dict[col]:
                        diff_string.append(
                            f"self.{col}={self_dict[col]} other.{col}={other_dict[col]}"
                        )
                diff_string = "\n".join(diff_string)
                raise AssertionError(
                    f"{type(self).__name__} row {n} differs:\n{diff_string}"
                )

        if self.num_rows != other.num_rows:
            raise AssertionError(
                f"{type(self).__name__} number of rows differ: self={self.num_rows} "
                f"other={other.num_rows}"
            )

        raise AssertionError(
            "Tables differ in an undetected way - "
            "this is a bug, please report an issue on github"
        )  # pragma: no cover
[docs] def add_row(self, record, timestamp=None): """ Adds a new row to this ProvenanceTable consisting of the specified record and timestamp. If timestamp is not specified, it is automatically generated from the current time. :param str record: A provenance record, describing the parameters and environment used to generate the current set of tables. :param str timestamp: A string timestamp. This should be in ISO8601 form. """ if timestamp is None: timestamp = datetime.datetime.now().isoformat() # Note that the order of the positional arguments has been reversed # from the low-level module, which is a bit confusing. However, we # want the default behaviour here to be to add a row to the table at # the current time as simply as possible. return self.ll_table.add_row(record=record, timestamp=timestamp)
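    # -- Example (editor's illustrative sketch, not part of the library;
    # the record contents are hypothetical) --
    # Recording a JSON provenance record; the timestamp is generated
    # automatically when omitted.
    #
    #     import json
    #     import tskit
    #
    #     tables = tskit.TableCollection(sequence_length=1)
    #     tables.provenances.add_row(record=json.dumps({"program": "my_tool"}))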
[docs]    def set_columns(
        self, timestamp=None, timestamp_offset=None, record=None, record_offset=None
    ):
        """
        Sets the values for each column in this :class:`ProvenanceTable` using the
        values in the specified arrays. Overwrites existing data in all the
        table columns.

        The ``timestamp`` and ``timestamp_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information). Likewise
        for the ``record`` and ``record_offset`` columns.

        :param timestamp: The flattened timestamp array. Must be specified along
            with ``timestamp_offset``. If not specified or None, an empty timestamp
            value is stored for each row.
        :type timestamp: numpy.ndarray, dtype=np.int8
        :param timestamp_offset: The offsets into the ``timestamp`` array.
        :type timestamp_offset: numpy.ndarray, dtype=np.uint32.
        :param record: The flattened record array. Must be specified along with
            ``record_offset``. If not specified or None, an empty record value is
            stored for each row.
        :type record: numpy.ndarray, dtype=np.int8
        :param record_offset: The offsets into the ``record`` array.
        :type record_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.set_columns(
            dict(
                timestamp=timestamp,
                timestamp_offset=timestamp_offset,
                record=record,
                record_offset=record_offset,
            )
        )
[docs]    def append_columns(
        self, timestamp=None, timestamp_offset=None, record=None, record_offset=None
    ):
        """
        Appends the specified arrays to the end of the columns of this
        :class:`ProvenanceTable`. This allows many new rows to be added at once.

        The ``timestamp`` and ``timestamp_offset`` parameters must be supplied
        together, and meet the requirements for
        :ref:`sec_encoding_ragged_columns` (see
        :ref:`sec_tables_api_binary_columns` for more information). Likewise
        for the ``record`` and ``record_offset`` columns.

        :param timestamp: The flattened timestamp array. Must be specified along
            with ``timestamp_offset``. If not specified or None, an empty timestamp
            value is stored for each row.
        :type timestamp: numpy.ndarray, dtype=np.int8
        :param timestamp_offset: The offsets into the ``timestamp`` array.
        :type timestamp_offset: numpy.ndarray, dtype=np.uint32.
        :param record: The flattened record array. Must be specified along with
            ``record_offset``. If not specified or None, an empty record value is
            stored for each row.
        :type record: numpy.ndarray, dtype=np.int8
        :param record_offset: The offsets into the ``record`` array.
        :type record_offset: numpy.ndarray, dtype=np.uint32.
        """
        self.ll_table.append_columns(
            dict(
                timestamp=timestamp,
                timestamp_offset=timestamp_offset,
                record=record,
                record_offset=record_offset,
            )
        )
def _text_header_and_rows(self, limit=None): headers = ("id", "timestamp", "record") rows = [] row_indexes = util.truncate_rows(self.num_rows, limit) for j in row_indexes: if j == -1: rows.append(f"__skipped__{self.num_rows-limit}") else: row = self[j] rows.append( ( str(j), str(row.timestamp), util.truncate_string_end(str(row.record), length=60), ) ) return headers, rows
[docs] def packset_record(self, records): """ Packs the specified list of record values and updates the ``record`` and ``record_offset`` columns. The length of the records array must be equal to the number of rows in the table. :param list(str) records: A list of string record values. """ packed, offset = util.pack_strings(records) d = self.asdict() d["record"] = packed d["record_offset"] = offset self.set_columns(**d)
[docs] def packset_timestamp(self, timestamps): """ Packs the specified list of timestamp values and updates the ``timestamp`` and ``timestamp_offset`` columns. The length of the timestamps array must be equal to the number of rows in the table. :param list(str) timestamps: A list of string timestamp values. """ packed, offset = util.pack_strings(timestamps) d = self.asdict() d["timestamp"] = packed d["timestamp_offset"] = offset self.set_columns(**d)
# We define segment ordering by (left, right, node) tuples
[docs]@dataclasses.dataclass(eq=True, order=True)
class IdentitySegment:
    """
    A single segment of identity spanning a genomic interval for a
    specific ancestor node.
    """

    left: float
    """The left genomic coordinate (inclusive)."""
    right: float
    """The right genomic coordinate (exclusive)."""
    node: int
    """The ID of the most recent common ancestor node."""

    @property
    def span(self) -> float:
        """
        The length of the genomic region spanned by this identity segment.
        """
        return self.right - self.left
[docs]class IdentitySegmentList(collections.abc.Iterable, collections.abc.Sized): """ A summary of identity segments for some pair of samples in a :class:`.IdentitySegments` result. If the ``store_segments`` argument has been specified to :meth:`.TreeSequence.ibd_segments`, this class can be treated as a sequence of :class:`.IdentitySegment` objects. Access to the segment data via numpy arrays is also available via the :attr:`.IdentitySegmentList.left`, :attr:`.IdentitySegmentList.right` and :attr:`.IdentitySegmentList.node` attributes. If ``store_segments`` is False, only the overall summary values such as :attr:`.IdentitySegmentList.total_span` and ``len()`` are available. .. warning:: The order of segments within an IdentitySegmentList is arbitrary and may change in the future """ def __init__(self, ll_segment_list): self._ll_segment_list = ll_segment_list def __iter__(self): for left, right, node in zip(self.left, self.right, self.node): yield IdentitySegment(float(left), float(right), int(node)) def __len__(self): return self._ll_segment_list.num_segments def __str__(self): return ( f"IdentitySegmentList(num_segments={len(self)}, " f"total_span={self.total_span})" ) def __repr__(self): return f"IdentitySegmentList({repr(list(self))})" def __eq__(self, other): if not isinstance(other, IdentitySegmentList): return False return list(self) == list(other) @property def total_span(self): """ The total genomic span covered by segments in this list. Equal to ``sum(seg.span for seg in seglst)``. """ return self._ll_segment_list.total_span @property def left(self): """ A numpy array (dtype=np.float64) of the ``left`` coordinates of segments. """ return self._ll_segment_list.left @property def right(self): """ A numpy array (dtype=np.float64) of the ``right`` coordinates of segments. """ return self._ll_segment_list.right @property def node(self): """ A numpy array (dtype=np.int32) of the MRCA node IDs in segments. """ return self._ll_segment_list.node
[docs]class IdentitySegments(collections.abc.Mapping): """ A class summarising and optionally storing the segments of identity by state returned by :meth:`.TreeSequence.ibd_segments`. See the :ref:`sec_identity` for more information and examples. Along with the documented methods and attributes, the class supports the Python mapping protocol, and can be regarded as a dictionary mapping sample node pair tuples to the corresponding :class:`.IdentitySegmentList`. .. note:: It is important to note that the facilities available for a given instance of this class are determined by the ``store_pairs`` and ``store_segments`` arguments provided to :meth:`.TreeSequence.ibd_segments`. For example, attempting to access per-sample pair information if ``store_pairs`` is False will result in a (hopefully informative) error being raised. .. warning:: This class should not be instantiated directly. """ def __init__(self, ll_result, *, max_time, min_span, store_segments, store_pairs): self._ll_identity_segments = ll_result self.max_time = max_time self.min_span = min_span self.store_segments = store_segments self.store_pairs = store_pairs @property def num_segments(self): """ The total number of identity segments found. """ return self._ll_identity_segments.num_segments @property def num_pairs(self): """ The total number of distinct sample pairs for which identity segments were found. (Only available when ``store_pairs`` or ``store_segments`` is specified). """ return self._ll_identity_segments.num_pairs @property def total_span(self): """ The total genomic sequence length spanned by all identity segments that were found. """ return self._ll_identity_segments.total_span @property def pairs(self): """ A numpy array with shape ``(segs.num_pairs, 2)`` and dtype=np.int32 containing the sample pairs for which IBD segments were found. """ return self._ll_identity_segments.get_keys() # We have two different versions of repr - one where we list out the segments # for debugging, and the other that just shows the standard representation. # We could have repr fail if store_segments isn't true, but then printing, # e.g., a list of IdentitySegments objects would fail unexpectedly. def __repr__(self): if self.store_segments: return f"IdentitySegments({dict(self)})" return super().__repr__() def __str__(self): # TODO it would be nice to add horizontal lines as # table separators to distinguish the two parts of the # table like suggested here: # https://github.com/tskit-dev/tskit/pull/1902#issuecomment-989943424 rows = [ ["Parameters:", ""], ["max_time", str(self.max_time)], ["min_span", str(self.min_span)], ["store_pairs", str(self.store_pairs)], ["store_segments", str(self.store_segments)], ["Results:", ""], ["num_segments", str(self.num_segments)], ["total_span", str(self.total_span)], ] if self.store_pairs: rows.append(["num_pairs", str(len(self))]) return util.unicode_table(rows, title="IdentitySegments", row_separator=False) def __getitem__(self, key): sample_a, sample_b = key return IdentitySegmentList(self._ll_identity_segments.get(sample_a, sample_b)) def __iter__(self): return map(tuple, self._ll_identity_segments.get_keys()) def __len__(self): return self.num_pairs
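# -- Example (editor's illustrative sketch, not part of the library) --
# Computing IBD segments for an existing tree sequence ``ts`` (assumed to
# be defined elsewhere) and iterating over the per-pair segment lists.
#
#     segs = ts.ibd_segments(store_pairs=True, store_segments=True)
#     for (a, b), seglist in segs.items():
#         print(a, b, len(seglist), seglist.total_span)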
# TODO move to reference_sequence.py when we start adding more functionality.
[docs]class ReferenceSequence(metadata.MetadataProvider): """ The :ref:`reference sequence<sec_data_model_reference_sequence>` associated with a given :class:`.TableCollection` or :class:`.TreeSequence`. Metadata concerning reference sequences can be described using the :attr:`.metadata_schema` and stored in the :attr:`.metadata` attribute. See the :ref:`examples<sec_metadata_examples_reference_sequence>` for idiomatic usage. .. warning:: This API is preliminary and currently only supports accessing reference sequence information via the ``.data`` attribute. Future versions will also enable transparent fetching of known reference sequences from a URL (see https://github.com/tskit-dev/tskit/issues/2022). """ def __init__(self, ll_reference_sequence): super().__init__(ll_reference_sequence) self._ll_reference_sequence = ll_reference_sequence
[docs] def is_null(self) -> bool: """ Returns True if this :class:`.ReferenceSequence` is null, i.e., all fields are empty. """ return bool(self._ll_reference_sequence.is_null())
def clear(self): self.data = "" self.url = "" self.metadata_schema = tskit.MetadataSchema(None) self.metadata = b"" # https://github.com/tskit-dev/tskit/issues/1984 # TODO add a __str__ method # TODO add a _repr_html_ # FIXME This is a shortcut, we want to put the values in explicitly # here to get more control over how they are displayed. def __repr__(self): return f"ReferenceSequence({repr(self.asdict())})" @property def data(self) -> str: """ The string encoding of the reference sequence such that ``data[j]`` represents the reference nucleotide at base ``j``. If this reference sequence is writable, the value can be assigned, e.g. ``tables.reference_sequence.data = "ACGT"`` """ return self._ll_reference_sequence.data @data.setter def data(self, value): self._ll_reference_sequence.data = value @property def url(self) -> str: return self._ll_reference_sequence.url @url.setter def url(self, value): self._ll_reference_sequence.url = value def asdict(self) -> dict: return { "metadata_schema": repr(self.metadata_schema), "metadata": self.metadata_bytes, "data": self.data, "url": self.url, } def __eq__(self, other): return self.equals(other) def equals(self, other, ignore_metadata=False): try: self.assert_equals(other, ignore_metadata) return True except AssertionError: return False def assert_equals(self, other, ignore_metadata=False): if not ignore_metadata: super().assert_equals(other) if self.data != other.data: raise AssertionError( f"Reference sequence data differs: self={self.data} " f"other={other.data}" ) if self.url != other.url: raise AssertionError( f"Reference sequence url differs: self={self.url} " f"other={other.url}" ) @property def nbytes(self): # TODO this will be inefficient when we work with large references. # Make a dedicated low-level method for getting the length of data. return super().nbytes + len(self.url) + len(self.data)
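# -- Example (editor's illustrative sketch, not part of the library) --
# Attaching a reference sequence to a table collection via the writable
# ``data`` attribute of this preliminary API.
#
#     import tskit
#
#     tables = tskit.TableCollection(sequence_length=4)
#     tables.reference_sequence.data = "ACGT"
#     assert tables.has_reference_sequence()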
[docs]class TableCollection(metadata.MetadataProvider):
    """
    A collection of mutable tables defining a tree sequence. See the
    :ref:`sec_data_model` section for definitions of the various tables and
    how they together define a :class:`TreeSequence`. Arbitrary
    data can be stored in a TableCollection, but there are certain
    :ref:`requirements <sec_valid_tree_sequence_requirements>` that must be
    satisfied for these tables to be interpreted as a tree sequence.

    To obtain an immutable :class:`TreeSequence` instance corresponding to the
    current state of a ``TableCollection``, please use the
    :meth:`.tree_sequence` method.
    """

    set_err_text = (
        "Cannot set tables in a table collection: use table.replace_with() instead."
    )

    def __init__(self, sequence_length=0, *, ll_tables=None):
        self._ll_tables = ll_tables
        if ll_tables is None:
            self._ll_tables = _tskit.TableCollection(sequence_length)
        super().__init__(self._ll_tables)
        self._individuals = IndividualTable(ll_table=self._ll_tables.individuals)
        self._nodes = NodeTable(ll_table=self._ll_tables.nodes)
        self._edges = EdgeTable(ll_table=self._ll_tables.edges)
        self._migrations = MigrationTable(ll_table=self._ll_tables.migrations)
        self._sites = SiteTable(ll_table=self._ll_tables.sites)
        self._mutations = MutationTable(ll_table=self._ll_tables.mutations)
        self._populations = PopulationTable(ll_table=self._ll_tables.populations)
        self._provenances = ProvenanceTable(ll_table=self._ll_tables.provenances)

    @property
    def individuals(self) -> IndividualTable:
        """
        The :ref:`sec_individual_table_definition` in this collection.
        """
        return self._individuals

    @individuals.setter
    def individuals(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def nodes(self) -> NodeTable:
        """
        The :ref:`sec_node_table_definition` in this collection.
        """
        return self._nodes

    @nodes.setter
    def nodes(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def edges(self) -> EdgeTable:
        """
        The :ref:`sec_edge_table_definition` in this collection.
        """
        return self._edges

    @edges.setter
    def edges(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def migrations(self) -> MigrationTable:
        """
        The :ref:`sec_migration_table_definition` in this collection.
        """
        return self._migrations

    @migrations.setter
    def migrations(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def sites(self) -> SiteTable:
        """
        The :ref:`sec_site_table_definition` in this collection.
        """
        return self._sites

    @sites.setter
    def sites(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def mutations(self) -> MutationTable:
        """
        The :ref:`sec_mutation_table_definition` in this collection.
        """
        return self._mutations

    @mutations.setter
    def mutations(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def populations(self) -> PopulationTable:
        """
        The :ref:`sec_population_table_definition` in this collection.
        """
        return self._populations

    @populations.setter
    def populations(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def provenances(self) -> ProvenanceTable:
        """
        The :ref:`sec_provenance_table_definition` in this collection.
        """
        return self._provenances

    @provenances.setter
    def provenances(self, value):
        raise AttributeError(self.set_err_text)

    @property
    def indexes(self) -> TableCollectionIndexes:
        """
        The edge insertion and removal indexes.
""" indexes = self._ll_tables.indexes return TableCollectionIndexes(**indexes) @indexes.setter def indexes(self, indexes): self._ll_tables.indexes = indexes.asdict() @property def sequence_length(self) -> float: """ The sequence length defining the coordinate space. """ return self._ll_tables.sequence_length @sequence_length.setter def sequence_length(self, sequence_length): self._ll_tables.sequence_length = sequence_length @property def file_uuid(self) -> str: """ The UUID for the file this TableCollection is derived from, or None if not derived from a file. """ return self._ll_tables.file_uuid @property def time_units(self) -> str: """ The units used for the time dimension of this TableCollection """ return self._ll_tables.time_units @time_units.setter def time_units(self, time_units: str) -> None: self._ll_tables.time_units = time_units
[docs] def has_reference_sequence(self): """ Returns True if this :class:`.TableCollection` has an associated :ref:`reference sequence<sec_data_model_reference_sequence>`. """ return bool(self._ll_tables.has_reference_sequence())
@property def reference_sequence(self): """ The :class:`.ReferenceSequence` associated with this :class:`.TableCollection`. .. note:: Note that the behaviour of this attribute differs from :attr:`.TreeSequence.reference_sequence` in that we return a valid instance of :class:`.ReferenceSequence` even when :attr:`.TableCollection.has_reference_sequence` is False. This is to allow us to update the state of the reference sequence. """ # NOTE: arguably we should cache the reference to this object # during init, rather than creating a new instance each time. # However, following the pattern of the Table classes for now # for consistency. return ReferenceSequence(self._ll_tables.reference_sequence) @reference_sequence.setter def reference_sequence(self, value: ReferenceSequence): self.reference_sequence.metadata_schema = value.metadata_schema self.reference_sequence.metadata = value.metadata self.reference_sequence.data = value.data self.reference_sequence.url = value.url
[docs] def asdict(self, force_offset_64=False): """ Returns the nested dictionary representation of this TableCollection used for interchange. Note: the semantics of this method changed at tskit 0.1.0. Previously a map of table names to the tables themselves was returned. :param bool force_offset_64: If True, all offset columns will have dtype np.uint64. If False (the default) the offset array columns will have a dtype of either np.uint32 or np.uint64, depending on the size of the corresponding data array. :return: The dictionary representation of this table collection. :rtype: dict """ return self._ll_tables.asdict(force_offset_64)
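    # -- Example (editor's illustrative sketch, not part of the library) --
    # The dict representation round-trips losslessly through fromdict.
    #
    #     import tskit
    #
    #     tables = tskit.TableCollection(sequence_length=1)
    #     copy = tskit.TableCollection.fromdict(tables.asdict())
    #     assert copy == tables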
@property def table_name_map(self) -> Dict: """ Returns a dictionary mapping table names to the corresponding table instances. For example, the returned dictionary will contain the key "edges" that maps to an :class:`.EdgeTable` instance. """ return { "edges": self.edges, "individuals": self.individuals, "migrations": self.migrations, "mutations": self.mutations, "nodes": self.nodes, "populations": self.populations, "provenances": self.provenances, "sites": self.sites, } @property def name_map(self) -> Dict: # Deprecated in 0.4.1 warnings.warn( "name_map is deprecated; use table_name_map instead", FutureWarning, stacklevel=4, ) return self.table_name_map @property def nbytes(self) -> int: """ Returns the total number of bytes required to store the data in this table collection. Note that this may not be equal to the actual memory footprint. """ return sum( ( 8, # sequence_length takes 8 bytes super().nbytes, # metadata len(self.time_units.encode()), self.indexes.nbytes, self.reference_sequence.nbytes, sum(table.nbytes for table in self.table_name_map.values()), ) ) def __str__(self): """ Return a plain text summary of this TableCollection """ return "\n".join( [ "TableCollection", "", f"Sequence Length: {self.sequence_length}", f"Time units: {self.time_units}", f"Metadata: {self.metadata}", "", "Individuals", str(self.individuals), "Nodes", str(self.nodes), "Edges", str(self.edges), "Sites", str(self.sites), "Mutations", str(self.mutations), "Migrations", str(self.migrations), "Populations", str(self.populations), "Provenances", str(self.provenances), ] )
[docs] def equals( self, other, *, ignore_metadata=False, ignore_ts_metadata=False, ignore_provenance=False, ignore_timestamps=False, ignore_tables=False, ignore_reference_sequence=False, ): """ Returns True if `self` and `other` are equal. By default, two table collections are considered equal if their - ``sequence_length`` properties are identical; - top-level tree sequence metadata and metadata schemas are byte-wise identical; - constituent tables are byte-wise identical. Some of the requirements in this definition can be relaxed using the parameters, which can be used to remove certain parts of the data model from the comparison. Table indexes are not considered in the equality comparison. :param TableCollection other: Another table collection. :param bool ignore_metadata: If True *all* metadata and metadata schemas will be excluded from the comparison. This includes the top-level tree sequence and constituent table metadata (default=False). :param bool ignore_ts_metadata: If True the top-level tree sequence metadata and metadata schemas will be excluded from the comparison. If ``ignore_metadata`` is True, this parameter has no effect. :param bool ignore_provenance: If True the provenance tables are not included in the comparison. :param bool ignore_timestamps: If True the provenance timestamp column is ignored in the comparison. If ``ignore_provenance`` is True, this parameter has no effect. :param bool ignore_tables: If True no tables are included in the comparison, thus comparing only the top-level information. :param bool ignore_reference_sequence: If True the reference sequence is not included in the comparison. :return: True if other is equal to this table collection; False otherwise. :rtype: bool """ ret = False if type(other) is type(self): ret = bool( self._ll_tables.equals( other._ll_tables, ignore_metadata=bool(ignore_metadata), ignore_ts_metadata=bool(ignore_ts_metadata), ignore_provenance=bool(ignore_provenance), ignore_timestamps=bool(ignore_timestamps), ignore_tables=bool(ignore_tables), ignore_reference_sequence=bool(ignore_reference_sequence), ) ) return ret
[docs]    def assert_equals(
        self,
        other,
        *,
        ignore_metadata=False,
        ignore_ts_metadata=False,
        ignore_provenance=False,
        ignore_timestamps=False,
        ignore_tables=False,
        ignore_reference_sequence=False,
    ):
        """
        Raise an AssertionError for the first found difference between
        this and another table collection. Note that table indexes are not checked.

        :param TableCollection other: Another table collection.
        :param bool ignore_metadata: If True *all* metadata and metadata schemas
            will be excluded from the comparison. This includes the top-level
            tree sequence and constituent table metadata (default=False).
        :param bool ignore_ts_metadata: If True the top-level tree sequence
            metadata and metadata schemas will be excluded from the comparison.
            If ``ignore_metadata`` is True, this parameter has no effect.
        :param bool ignore_provenance: If True the provenance tables are not
            included in the comparison.
        :param bool ignore_timestamps: If True the provenance timestamp column
            is ignored in the comparison. If ``ignore_provenance`` is True, this
            parameter has no effect.
        :param bool ignore_tables: If True no tables are included in the
            comparison, thus comparing only the top-level information.
        :param bool ignore_reference_sequence: If True the reference sequence
            is not included in the comparison.
        """
        if type(other) is not type(self):
            raise AssertionError(f"Types differ: self={type(self)} other={type(other)}")

        # Check using the low-level method to avoid slowly going through everything
        if self.equals(
            other,
            ignore_metadata=ignore_metadata,
            ignore_ts_metadata=ignore_ts_metadata,
            ignore_provenance=ignore_provenance,
            ignore_timestamps=ignore_timestamps,
            ignore_tables=ignore_tables,
            ignore_reference_sequence=ignore_reference_sequence,
        ):
            return

        if not (ignore_metadata or ignore_ts_metadata):
            super().assert_equals(other)

        if not ignore_reference_sequence:
            self.reference_sequence.assert_equals(
                other.reference_sequence, ignore_metadata=ignore_metadata
            )

        if self.time_units != other.time_units:
            raise AssertionError(
                f"Time units differ: self={self.time_units} "
                f"other={other.time_units}"
            )

        if self.sequence_length != other.sequence_length:
            raise AssertionError(
                f"Sequence length differs: "
                f"self={self.sequence_length} other={other.sequence_length}"
            )

        for table_name, table in self.table_name_map.items():
            if table_name != "provenances":
                table.assert_equals(
                    getattr(other, table_name), ignore_metadata=ignore_metadata
                )

        if not ignore_provenance:
            self.provenances.assert_equals(
                other.provenances, ignore_timestamps=ignore_timestamps
            )

        raise AssertionError(
            "TableCollections differ in an undetected way - "
            "this is a bug, please report an issue on github"
        )  # pragma: no cover
def __eq__(self, other): return self.equals(other) def __getstate__(self): return self.asdict() @classmethod def load(cls, file_or_path, *, skip_tables=False, skip_reference_sequence=False): file, local_file = util.convert_file_like_to_open_file(file_or_path, "rb") ll_tc = _tskit.TableCollection() try: ll_tc.load( file, skip_tables=skip_tables, skip_reference_sequence=skip_reference_sequence, ) return TableCollection(ll_tables=ll_tc) except tskit.FileFormatError as e: util.raise_known_file_format_errors(file, e) finally: if local_file: file.close()
[docs]    def dump(self, file_or_path):
        """
        Writes the table collection to the specified path or file object.

        :param str file_or_path: The file object or path to write the
            TableCollection to.
        """
        file, local_file = util.convert_file_like_to_open_file(file_or_path, "wb")
        try:
            self._ll_tables.dump(file)
        finally:
            if local_file:
                file.close()
    # Unpickle support
    def __setstate__(self, state):
        self.__init__()
        self._ll_tables.fromdict(state)

    @classmethod
    def fromdict(cls, tables_dict):
        ll_tc = _tskit.TableCollection()
        ll_tc.fromdict(tables_dict)
        return TableCollection(ll_tables=ll_tc)
[docs] def copy(self): """ Returns a deep copy of this TableCollection. :return: A deep copy of this TableCollection. :rtype: tskit.TableCollection """ return TableCollection.fromdict(self.asdict())
[docs] def tree_sequence(self): """ Returns a :class:`TreeSequence` instance from the tables defined in this :class:`TableCollection`, building the required indexes if they have not yet been created by :meth:`.build_index`. If the table collection does not meet the :ref:`sec_valid_tree_sequence_requirements`, for example if the tables are not correctly sorted or if they cannot be interpreted as a tree sequence, an exception is raised. Note that in the former case, the :meth:`.sort` method may be used to ensure that sorting requirements are met. :return: A :class:`TreeSequence` instance reflecting the structures defined in this set of tables. :rtype: tskit.TreeSequence """ if not self.has_index(): self.build_index() return tskit.TreeSequence.load_tables(self)
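    # -- Example (editor's illustrative sketch, not part of the library) --
    # Building a minimal valid tree sequence from hand-constructed tables;
    # the required indexes are built automatically.
    #
    #     import tskit
    #
    #     tables = tskit.TableCollection(sequence_length=10)
    #     child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
    #     parent = tables.nodes.add_row(time=1)
    #     tables.edges.add_row(left=0, right=10, parent=parent, child=child)
    #     ts = tables.tree_sequence()
    #     assert ts.num_trees == 1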
[docs] def simplify( self, samples=None, *, reduce_to_site_topology=False, filter_populations=None, filter_individuals=None, filter_sites=None, filter_nodes=None, update_sample_flags=None, keep_unary=False, keep_unary_in_individuals=None, keep_input_roots=False, record_provenance=True, filter_zero_mutation_sites=None, # Deprecated alias for filter_sites ): """ Simplifies the tables in place to retain only the information necessary to reconstruct the tree sequence describing the given ``samples``. If ``filter_nodes`` is True (the default), this can change the ID of the nodes, so that the node ``samples[k]`` will have ID ``k`` in the result, resulting in a NodeTable where only the first ``len(samples)`` nodes are marked as samples. The mapping from node IDs in the current set of tables to their equivalent values in the simplified tables is returned as a numpy array. If an array ``a`` is returned by this function and ``u`` is the ID of a node in the input table, then ``a[u]`` is the ID of this node in the output table. For any node ``u`` that is not mapped into the output tables, this mapping will equal ``tskit.NULL`` (``-1``). Tables operated on by this function must: be sorted (see :meth:`TableCollection.sort`), have children be born strictly after their parents, and the intervals on which any node is a child must be disjoint. Other than this the tables need not satisfy remaining requirements to specify a valid tree sequence (but the resulting tables will). .. note:: To invert the returned ``node_map``, that is, to obtain a reverse mapping from the node ID in the output table to the node ID in the input table, you can use:: rev_map = np.zeros_like(node_map, shape=simplified_ts.num_nodes) kept = node_map != tskit.NULL rev_map[node_map[kept]] = np.arange(len(node_map))[kept] In this case, no elements of the ``rev_map`` array will be set to ``tskit.NULL``. .. seealso:: This is identical to :meth:`TreeSequence.simplify` but acts *in place* to alter the data in this :class:`TableCollection`. Please see the :meth:`TreeSequence.simplify` method for a description of the remaining parameters. :param list[int] samples: A list of node IDs to retain as samples. They need not be nodes marked as samples in the original tree sequence, but will constitute the entire set of samples in the returned tree sequence. If not specified or None, use all nodes marked with the IS_SAMPLE flag. The list may be provided as a numpy array (or array-like) object (dtype=np.int32). :param bool reduce_to_site_topology: Whether to reduce the topology down to the trees that are present at sites. (Default: False). :param bool filter_populations: If True, remove any populations that are not referenced by nodes after simplification; new population IDs are allocated sequentially from zero. If False, the population table will not be altered in any way. (Default: None, treated as True) :param bool filter_individuals: If True, remove any individuals that are not referenced by nodes after simplification; new individual IDs are allocated sequentially from zero. If False, the individual table will not be altered in any way. (Default: None, treated as True) :param bool filter_sites: If True, remove any sites that are not referenced by mutations after simplification; new site IDs are allocated sequentially from zero. If False, the site table will not be altered in any way. (Default: None, treated as True) :param bool filter_nodes: If True, remove any nodes that are not referenced by edges after simplification. 
        If False, the only potential change to the node table may be to
            change the node flags (if ``samples`` is specified and different
            from the existing samples). (Default: None, treated as True)
        :param bool update_sample_flags: If True, update node flags so that
            nodes in the specified list of samples have the NODE_IS_SAMPLE flag
            after simplification, and nodes that are not in this list do not.
            (Default: None, treated as True)
        :param bool keep_unary: If True, preserve unary nodes (i.e., nodes with
            exactly one child) that exist on the path from samples to root.
            (Default: False)
        :param bool keep_unary_in_individuals: If True, preserve unary nodes
            that exist on the path from samples to root, but only if they are
            associated with an individual in the individuals table. Cannot be
            specified at the same time as ``keep_unary``. (Default: ``None``,
            equivalent to False)
        :param bool keep_input_roots: Whether to retain history ancestral to the
            MRCA of the samples. If ``False``, no topology older than the MRCAs
            of the samples will be included. If ``True`` the roots of all trees
            in the returned tree sequence will be the same roots as in the
            original tree sequence. (Default: False)
        :param bool record_provenance: If True, record details of this call to
            simplify in this table collection's provenance information
            (Default: True).
        :param bool filter_zero_mutation_sites: Deprecated alias for
            ``filter_sites``.
        :return: A numpy array mapping node IDs in the input tables to their
            corresponding node IDs in the output tables.
        :rtype: numpy.ndarray (dtype=np.int32)
        """
        if filter_zero_mutation_sites is not None:
            # Deprecated in msprime 0.6.1.
            warnings.warn(
                "filter_zero_mutation_sites is deprecated; use filter_sites instead",
                FutureWarning,
                stacklevel=4,
            )
            filter_sites = filter_zero_mutation_sites
        if samples is None:
            flags = self.nodes.flags
            samples = np.where(np.bitwise_and(flags, _tskit.NODE_IS_SAMPLE) != 0)[
                0
            ].astype(np.int32)
        else:
            samples = util.safe_np_int_cast(samples, np.int32)
        if filter_populations is None:
            filter_populations = True
        if filter_individuals is None:
            filter_individuals = True
        if filter_sites is None:
            filter_sites = True
        if filter_nodes is None:
            filter_nodes = True
        if update_sample_flags is None:
            update_sample_flags = True
        if keep_unary_in_individuals is None:
            keep_unary_in_individuals = False

        node_map = self._ll_tables.simplify(
            samples,
            filter_sites=filter_sites,
            filter_individuals=filter_individuals,
            filter_populations=filter_populations,
            filter_nodes=filter_nodes,
            update_sample_flags=update_sample_flags,
            reduce_to_site_topology=reduce_to_site_topology,
            keep_unary=keep_unary,
            keep_unary_in_individuals=keep_unary_in_individuals,
            keep_input_roots=keep_input_roots,
        )
        if record_provenance:
            # TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
            # TODO also make sure we convert all the arguments so that they are
            # definitely JSON encodable.
            parameters = {"command": "simplify", "TODO": "add simplify parameters"}
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
        return node_map
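    # -- Example (editor's illustrative sketch, not part of the library) --
    # Simplifying the tables of an existing tree sequence ``ts`` (assumed to
    # be defined elsewhere) down to two samples, then mapping an input node
    # ID through the returned node map.
    #
    #     tables = ts.dump_tables()
    #     node_map = tables.simplify(samples=[0, 1])
    #     new_id = node_map[0]  # tskit.NULL if node 0 was removed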
def map_ancestors(self, *args, **kwargs): # A deprecated alias for link_ancestors() return self.link_ancestors(*args, **kwargs)
[docs]    def sort(self, edge_start=0, *, site_start=0, mutation_start=0):
        """
        Sorts the tables in place. This ensures that all tree sequence ordering
        requirements listed in the
        :ref:`sec_valid_tree_sequence_requirements` section are met, as long
        as each site has at most one mutation (see below).

        If the ``edge_start`` parameter is provided, this specifies the index
        in the edge table where sorting should start. Only rows with index
        greater than or equal to ``edge_start`` are sorted; rows before this index
        are not affected. This parameter is provided to allow for efficient sorting
        when the user knows that the edges up to a given index are already sorted.

        If both ``site_start`` and ``mutation_start`` are equal to the number of
        rows in their respective tables then neither is sorted. Note that sorting
        cannot be skipped for just one of these two tables: either both are
        sorted, or neither is.

        The node, individual, population and provenance tables are not affected
        by this method.

        Edges are sorted as follows:

        - time of parent, then
        - parent node ID, then
        - child node ID, then
        - left endpoint.

        Note that this sorting order exceeds the
        :ref:`edge sorting requirements <sec_edge_requirements>` for a valid
        tree sequence. For a valid tree sequence, we require that all edges for a
        given parent ID are adjacent, but we do not require that they be listed in
        sorted order.

        Sites are sorted by position, and sites with the same position retain
        their relative ordering.

        Mutations are sorted by site ID, and within the same site are sorted by time.
        Those with equal or unknown time retain their relative ordering. This does not
        currently rearrange tables so that mutations occur after their mutation parents,
        which is a requirement for valid tree sequences.

        Migrations are sorted by ``time``, ``source``, ``dest``, ``left`` and
        ``node`` values. This defines a total sort order, such that any permutation
        of a valid migration table will be sorted into the same output order.
        Note that this sorting order exceeds the
        :ref:`migration sorting requirements <sec_migration_requirements>` for a
        valid tree sequence, which only requires that migrations are sorted by
        time value.

        :param int edge_start: The index in the edge table where sorting starts
            (default=0; must be <= len(edges)).
        :param int site_start: The index in the site table where sorting starts
            (default=0; must be one of [0, len(sites)]).
        :param int mutation_start: The index in the mutation table where sorting
            starts (default=0; must be one of [0, len(mutations)]).
        """
        self._ll_tables.sort(edge_start, site_start, mutation_start)
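    # -- Example (editor's illustrative sketch, not part of the library) --
    # Edges added in an arbitrary order are reordered in place by sort();
    # here the two edges differ only in their left coordinate.
    #
    #     import tskit
    #
    #     tables = tskit.TableCollection(sequence_length=10)
    #     child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
    #     parent = tables.nodes.add_row(time=1)
    #     tables.edges.add_row(left=5, right=10, parent=parent, child=child)
    #     tables.edges.add_row(left=0, right=5, parent=parent, child=child)
    #     tables.sort()
    #     assert tables.edges[0].left == 0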
# TODO add provenance
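# A minimal sketch (editor's example, not part of the module source) of
# sort() repairing edge order: the edge with the older parent is inserted
# first, which violates the parent-time ordering.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   root = tables.nodes.add_row(time=2)
#   mid = tables.nodes.add_row(time=1)
#   leaf = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, root, mid)  # parent time 2
#   tables.edges.add_row(0, 10, mid, leaf)  # parent time 1
#   tables.sort()
#   # Edges are now ordered by parent time: the mid -> leaf edge comes first.
#   assert tables.edges[0].parent == mid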
[docs] def sort_individuals(self):
        """
        Sorts the individual table in place, so that parents come before
        children, and the parent column is remapped as required. Node
        references to individuals are also updated.
        """
        self._ll_tables.sort_individuals()
# TODO add provenance
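# A hedged sketch (editor's example): an individual listed before its
# parent is reordered by sort_individuals(), and the node's individual
# reference is remapped accordingly.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=1)
#   child = tables.individuals.add_row(parents=[1])  # refers to the row below
#   tables.individuals.add_row()                     # the parent individual
#   tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=child)
#   tables.sort_individuals()
#   # The parent is now row 0 and the child row 1; the node follows the child.
#   assert tables.nodes[0].individual == 1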
[docs] def canonicalise(self, remove_unreferenced=None):
        """
        This puts the tables in *canonical* form, imposing a stricter order on
        the tables than :ref:`required <sec_valid_tree_sequence_requirements>`
        for a valid tree sequence. In particular, the individual and
        population tables are sorted by the first node that refers to each
        (see :meth:`TreeSequence.subset`). Then, the remaining tables are
        sorted as in :meth:`.sort`, with the modification that mutations are
        sorted by site, then time, then number of descendant mutations
        (ensuring that parent mutations occur before children), then node,
        then original order in the tables. This ensures that any two tables
        with the same information and node order should be identical after
        canonical sorting (note that no canonical order exists for the node
        table).

        By default, the method removes sites, individuals, and populations
        that are not referenced (sites by mutations; individuals and
        populations by nodes). If you wish to keep these, pass
        ``remove_unreferenced=False``, but note that unreferenced individuals
        and populations are put at the end of the tables in their original
        order.

        .. seealso:: :meth:`.sort` for sorting edges, mutations, and sites,
            and :meth:`.subset` for reordering nodes, individuals, and
            populations.

        :param bool remove_unreferenced: Whether to remove unreferenced sites,
            individuals, and populations (default=True).
        """
        remove_unreferenced = (
            True if remove_unreferenced is None else remove_unreferenced
        )
        self._ll_tables.canonicalise(remove_unreferenced=remove_unreferenced)
# TODO add provenance
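# A small sketch (editor's example): by default canonicalise() drops rows
# that nothing refers to, here an unreferenced population.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=1)
#   tables.populations.add_row()  # referenced by the node below
#   tables.populations.add_row()  # referenced by nothing
#   tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, population=0)
#   tables.canonicalise()
#   assert len(tables.populations) == 1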
[docs] def compute_mutation_parents(self):
        """
        Modifies the tables in place, computing the ``parent`` column of the
        mutation table. For this to work, the node and edge tables must be
        valid, and the site and mutation tables must be sorted (see
        :meth:`TableCollection.sort`). This will produce an error if mutations
        are not sorted (i.e., if a mutation appears before its mutation
        parent) *unless* the two mutations occur on the same branch, in which
        case there is no way to detect the error.

        The ``parent`` of a given mutation is the ID of the next mutation
        encountered traversing the tree upwards from that mutation, or
        ``NULL`` if there is no such mutation.
        """
        self._ll_tables.compute_mutation_parents()
# TODO add provenance
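# A minimal sketch (editor's example): two mutations at one site on a
# chain of branches; the parent column of the lower mutation is computed
# from the topology.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   root = tables.nodes.add_row(time=2)
#   mid = tables.nodes.add_row(time=1)
#   leaf = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, root, mid)
#   tables.edges.add_row(0, 10, mid, leaf)
#   site = tables.sites.add_row(position=5, ancestral_state="A")
#   tables.mutations.add_row(site=site, node=mid, derived_state="T")
#   tables.mutations.add_row(site=site, node=leaf, derived_state="G")
#   tables.sort()
#   tables.build_index()
#   tables.compute_mutation_parents()
#   assert tables.mutations[1].parent == 0  # the leaf mutation is below mutation 0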
[docs] def compute_mutation_times(self):
        """
        Modifies the tables in place, computing valid values for the ``time``
        column of the mutation table. For this to work, the node and edge
        tables must be valid, and the site and mutation tables must be sorted
        and indexed (see :meth:`TableCollection.sort` and
        :meth:`TableCollection.build_index`).

        For a single mutation on an edge at a site, the ``time`` assigned to
        a mutation by this method is the mid-point between the times of the
        nodes above and below the mutation. In the case where there is more
        than one mutation on an edge for a site, the times are evenly spread
        along the edge. For mutations that are above a root node, the time of
        the root node is assigned.

        The mutation table will be sorted if the new times mean that the
        original order is no longer valid.
        """
        self._ll_tables.compute_mutation_times()
# TODO add provenance
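# A hedged sketch (editor's example): a single mutation on a branch
# running from time 0 to time 2 is assigned the mid-point time 1.0.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   parent = tables.nodes.add_row(time=2)
#   child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, parent, child)
#   site = tables.sites.add_row(position=5, ancestral_state="A")
#   tables.mutations.add_row(site=site, node=child, derived_state="T")
#   tables.sort()
#   tables.build_index()
#   tables.compute_mutation_times()
#   assert tables.mutations[0].time == 1.0  # midpoint of times 0 and 2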
[docs] def deduplicate_sites(self):
        """
        Modifies the tables in place, removing entries in the site table with
        duplicate ``position`` (and keeping only the *first* entry for each
        site), and renumbering the ``site`` column of the mutation table
        appropriately. This requires the site table to be sorted by position.

        .. warning:: This method does not sort the tables afterwards, so
            mutations may no longer be sorted by time.
        """
        self._ll_tables.deduplicate_sites()
# TODO add provenance
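# A minimal sketch (editor's example): two site rows at the same position
# collapse to one, and the mutation's site reference is renumbered to the
# surviving (first) row.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   node = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.sites.add_row(position=5, ancestral_state="A")
#   dup = tables.sites.add_row(position=5, ancestral_state="A")  # duplicate
#   tables.mutations.add_row(site=dup, node=node, derived_state="T")
#   tables.deduplicate_sites()
#   assert len(tables.sites) == 1 and tables.mutations[0].site == 0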
[docs] def delete_sites(self, site_ids, record_provenance=True):
        """
        Remove the specified sites entirely from the sites and mutations tables
        in this collection. This is identical to
        :meth:`TreeSequence.delete_sites` but acts *in place* to alter the
        data in this :class:`TableCollection`.

        :param list[int] site_ids: A list of site IDs specifying the sites
            to remove.
        :param bool record_provenance: If ``True``, add details of this
            operation to the provenance table in this TableCollection.
            (Default: ``True``).
        """
        keep_sites = np.ones(len(self.sites), dtype=bool)
        site_ids = util.safe_np_int_cast(site_ids, np.int32)
        if np.any(site_ids < 0) or np.any(site_ids >= len(self.sites)):
            raise ValueError("Site ID out of bounds")
        keep_sites[site_ids] = False
        new_as, new_as_offset = keep_with_offset(
            keep_sites, self.sites.ancestral_state, self.sites.ancestral_state_offset
        )
        new_md, new_md_offset = keep_with_offset(
            keep_sites, self.sites.metadata, self.sites.metadata_offset
        )
        self.sites.set_columns(
            position=self.sites.position[keep_sites],
            ancestral_state=new_as,
            ancestral_state_offset=new_as_offset,
            metadata=new_md,
            metadata_offset=new_md_offset,
        )
        # We also need to adjust the mutations table, as it references into sites
        keep_mutations = keep_sites[self.mutations.site]
        new_ds, new_ds_offset = keep_with_offset(
            keep_mutations,
            self.mutations.derived_state,
            self.mutations.derived_state_offset,
        )
        new_md, new_md_offset = keep_with_offset(
            keep_mutations, self.mutations.metadata, self.mutations.metadata_offset
        )
        # Site numbers will have changed
        site_map = np.cumsum(keep_sites, dtype=self.mutations.site.dtype) - 1
        # Mutation numbers will change, so the parent references need altering
        mutation_map = np.cumsum(keep_mutations, dtype=self.mutations.parent.dtype) - 1
        # Map parent == -1 to -1, and check this has worked (assumes tskit.NULL == -1)
        mutation_map = np.append(mutation_map, -1).astype(self.mutations.parent.dtype)
        assert mutation_map[tskit.NULL] == tskit.NULL
        self.mutations.set_columns(
            site=site_map[self.mutations.site[keep_mutations]],
            node=self.mutations.node[keep_mutations],
            time=self.mutations.time[keep_mutations],
            derived_state=new_ds,
            derived_state_offset=new_ds_offset,
            parent=mutation_map[self.mutations.parent[keep_mutations]],
            metadata=new_md,
            metadata_offset=new_md_offset,
        )
        if record_provenance:
            # TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
            parameters = {"command": "delete_sites", "TODO": "add parameters"}
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
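# A small sketch (editor's example): deleting the middle of three sites
# renumbers the remaining mutation's site reference from 2 down to 1.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   node = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   for pos in [1, 2, 3]:
#       tables.sites.add_row(position=pos, ancestral_state="A")
#   tables.mutations.add_row(site=2, node=node, derived_state="T")
#   tables.delete_sites([1])
#   assert len(tables.sites) == 2 and tables.mutations[0].site == 1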
[docs] def delete_intervals(self, intervals, simplify=True, record_provenance=True):
        """
        Delete all information from this set of tables which lies *within* the
        specified list of genomic intervals. This is identical to
        :meth:`TreeSequence.delete_intervals` but acts *in place* to alter
        the data in this :class:`TableCollection`.

        :param array_like intervals: A list of (start, end) pairs describing
            the genomic intervals to delete. Intervals must be non-overlapping
            and in increasing order. The list of intervals must be
            interpretable as a 2D numpy array with shape (N, 2), where N is
            the number of intervals.
        :param bool simplify: If True, run simplify on the tables so that
            nodes no longer used are discarded. (Default: True).
        :param bool record_provenance: If ``True``, add details of this
            operation to the provenance table in this TableCollection.
            (Default: ``True``).
        """
        self.keep_intervals(
            util.negate_intervals(intervals, 0, self.sequence_length),
            simplify=simplify,
            record_provenance=False,
        )
        if record_provenance:
            parameters = {"command": "delete_intervals", "TODO": "add parameters"}
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
[docs] def keep_intervals(self, intervals, simplify=True, record_provenance=True):
        """
        Delete all information from this set of tables which lies *outside*
        the specified list of genomic intervals. This is identical to
        :meth:`TreeSequence.keep_intervals` but acts *in place* to alter the
        data in this :class:`TableCollection`.

        :param array_like intervals: A list of (start, end) pairs describing
            the genomic intervals to keep. Intervals must be non-overlapping
            and in increasing order. The list of intervals must be
            interpretable as a 2D numpy array with shape (N, 2), where N is
            the number of intervals.
        :param bool simplify: If True, run simplify on the tables so that
            nodes no longer used are discarded. Must be ``False`` if the
            input tree sequence includes migrations. (Default: True).
        :param bool record_provenance: If ``True``, add details of this
            operation to the provenance table in this TableCollection.
            (Default: ``True``).
        """
        intervals = util.intervals_to_np_array(intervals, 0, self.sequence_length)

        edges = self.edges.copy()
        self.edges.clear()
        migrations = self.migrations.copy()
        self.migrations.clear()
        keep_sites = np.repeat(False, self.sites.num_rows)
        for s, e in intervals:
            curr_keep_sites = np.logical_and(
                self.sites.position >= s, self.sites.position < e
            )
            keep_sites = np.logical_or(keep_sites, curr_keep_sites)
            keep_edges = np.logical_not(
                np.logical_or(edges.right <= s, edges.left >= e)
            )
            metadata, metadata_offset = keep_with_offset(
                keep_edges, edges.metadata, edges.metadata_offset
            )
            self.edges.append_columns(
                left=np.fmax(s, edges.left[keep_edges]),
                right=np.fmin(e, edges.right[keep_edges]),
                parent=edges.parent[keep_edges],
                child=edges.child[keep_edges],
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
            keep_migrations = np.logical_not(
                np.logical_or(migrations.right <= s, migrations.left >= e)
            )
            metadata, metadata_offset = keep_with_offset(
                keep_migrations, migrations.metadata, migrations.metadata_offset
            )
            self.migrations.append_columns(
                left=np.fmax(s, migrations.left[keep_migrations]),
                right=np.fmin(e, migrations.right[keep_migrations]),
                node=migrations.node[keep_migrations],
                source=migrations.source[keep_migrations],
                dest=migrations.dest[keep_migrations],
                time=migrations.time[keep_migrations],
                metadata=metadata,
                metadata_offset=metadata_offset,
            )
        self.delete_sites(
            np.where(np.logical_not(keep_sites))[0], record_provenance=False
        )

        self.sort()
        if simplify:
            self.simplify(record_provenance=False)
        if record_provenance:
            parameters = {"command": "keep_intervals", "TODO": "add parameters"}
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
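# A hedged sketch of keep_intervals() (editor's example; delete_intervals()
# is its complement): keeping [0, 5) truncates the edge at 5 and drops the
# site at position 7.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   parent = tables.nodes.add_row(time=1)
#   child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, parent, child)
#   tables.sites.add_row(position=7, ancestral_state="A")
#   tables.keep_intervals([[0, 5]])
#   assert tables.edges[0].right == 5 and len(tables.sites) == 0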
    def _check_trim_conditions(self):
        # Check the edge table first: the migration bounds checked below
        # require a non-empty edge table.
        if self.edges.num_rows == 0:
            raise ValueError(
                "Trimming a tree sequence with no edges would reduce the sequence length"
                " to zero, which is not allowed"
            )
        if self.migrations.num_rows > 0:
            if (np.min(self.migrations.left) < np.min(self.edges.left)) or (
                np.max(self.migrations.right) > np.max(self.edges.right)
            ):
                raise ValueError(
                    "Cannot trim a tree sequence with migrations which exist to the"
                    " left of the leftmost edge or to the right of the rightmost edge."
                )
[docs] def ltrim(self, record_provenance=True):
        """
        Reset the coordinate system used in these tables, shifting the
        genomic positions in the edge, site and migration tables to the left
        such that the leftmost edge now starts at position 0. This is
        identical to :meth:`TreeSequence.ltrim` but acts *in place* to alter
        the data in this :class:`TableCollection`.

        :param bool record_provenance: If ``True``, add details of this
            operation to the provenance table in this TableCollection.
            (Default: ``True``).
        """
        self._check_trim_conditions()
        leftmost = np.min(self.edges.left)
        self.delete_sites(
            np.where(self.sites.position < leftmost)[0], record_provenance=False
        )
        self.edges.set_columns(
            left=self.edges.left - leftmost,
            right=self.edges.right - leftmost,
            parent=self.edges.parent,
            child=self.edges.child,
            # Carry the edge metadata through unchanged rather than dropping it.
            metadata=self.edges.metadata,
            metadata_offset=self.edges.metadata_offset,
        )
        self.sites.set_columns(
            position=self.sites.position - leftmost,
            ancestral_state=self.sites.ancestral_state,
            ancestral_state_offset=self.sites.ancestral_state_offset,
            metadata=self.sites.metadata,
            metadata_offset=self.sites.metadata_offset,
        )
        self.migrations.set_columns(
            left=self.migrations.left - leftmost,
            right=self.migrations.right - leftmost,
            time=self.migrations.time,
            node=self.migrations.node,
            source=self.migrations.source,
            dest=self.migrations.dest,
            # Likewise, preserve the migration metadata.
            metadata=self.migrations.metadata,
            metadata_offset=self.migrations.metadata_offset,
        )
        self.sequence_length = self.sequence_length - leftmost
        if record_provenance:
            # TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
            parameters = {
                "command": "ltrim",
            }
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
[docs] def rtrim(self, record_provenance=True):
        """
        Reset the ``sequence_length`` property so that the sequence ends at
        the end of the last edge, removing any sites at or beyond that
        position. This is identical to :meth:`TreeSequence.rtrim` but acts
        *in place* to alter the data in this :class:`TableCollection`.

        :param bool record_provenance: If ``True``, add details of this
            operation to the provenance table in this TableCollection.
            (Default: ``True``).
        """
        self._check_trim_conditions()
        rightmost = np.max(self.edges.right)
        self.delete_sites(
            np.where(self.sites.position >= rightmost)[0], record_provenance=False
        )
        self.sequence_length = rightmost
        if record_provenance:
            # TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
            parameters = {
                "command": "rtrim",
            }
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
[docs] def trim(self, record_provenance=True):
        """
        Trim away any empty regions on the right and left of the tree sequence
        encoded by these tables. This is identical to
        :meth:`TreeSequence.trim` but acts *in place* to alter the data in
        this :class:`TableCollection`.

        :param bool record_provenance: If ``True``, add details of this
            operation to the provenance table in this TableCollection.
            (Default: ``True``).
        """
        self.rtrim(record_provenance=False)
        self.ltrim(record_provenance=False)
        if record_provenance:
            # TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
            parameters = {
                "command": "trim",
            }
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
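# A minimal sketch of the trim family (editor's example): an edge spanning
# [2, 8) on a length-10 sequence. trim() (rtrim() then ltrim()) removes
# both flanks and shifts coordinates left by 2.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   parent = tables.nodes.add_row(time=1)
#   child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(2, 8, parent, child)
#   tables.trim()
#   assert tables.sequence_length == 6
#   assert (tables.edges[0].left, tables.edges[0].right) == (0, 6)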
[docs] def delete_older(self, time):
        """
        Deletes edge, mutation and migration information at least as old as
        the specified time.

        .. seealso:: This method is similar to the higher-level
            :meth:`TreeSequence.decapitate` method, which also splits edges
            that intersect with the given time.
            :meth:`TreeSequence.decapitate` is more useful for most purposes,
            and may be what you need instead of this method!

        For the purposes of this method, an edge covers the times from the
        child node up until the *parent* node, so that any edge with parent
        node time > ``time`` will be removed.

        Any mutation whose time is >= ``time`` will be removed. A mutation's
        time is its associated ``time`` value, or the time of its node if
        the mutation's time was marked as unknown (:data:`UNKNOWN_TIME`).

        Any migration with time >= ``time`` will be removed.

        The node table is not affected by this operation.

        .. note:: This method does not have any specific sorting requirements
            and will maintain mutation parent mappings.

        :param float time: The cutoff time.
        """
        self._ll_tables.delete_older(time)
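# A small sketch (editor's example): with parent times 1 and 2,
# delete_older(1.5) removes the edge whose parent is at time 2 but keeps
# the younger edge.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   root = tables.nodes.add_row(time=2)
#   mid = tables.nodes.add_row(time=1)
#   leaf = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, root, mid)
#   tables.edges.add_row(0, 10, mid, leaf)
#   tables.delete_older(1.5)
#   assert len(tables.edges) == 1 and tables.edges[0].parent == mid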
[docs] def clear(
        self,
        clear_provenance=False,
        clear_metadata_schemas=False,
        clear_ts_metadata_and_schema=False,
    ):
        """
        Remove all rows of the data tables, optionally removing the
        provenance table rows, the table metadata schemas, and the tree
        sequence-level metadata and schema.

        :param bool clear_provenance: If ``True``, remove all rows of the
            provenance table. (Default: ``False``).
        :param bool clear_metadata_schemas: If ``True``, clear the table
            metadata schemas. (Default: ``False``).
        :param bool clear_ts_metadata_and_schema: If ``True``, clear the
            tree sequence-level metadata and schema. (Default: ``False``).
        """
        self._ll_tables.clear(
            clear_provenance=clear_provenance,
            clear_metadata_schemas=clear_metadata_schemas,
            clear_ts_metadata_and_schema=clear_ts_metadata_and_schema,
        )
[docs] def has_index(self):
        """
        Returns True if this TableCollection is indexed. See
        :ref:`sec_table_indexes` for information on indexes.
        """
        return bool(self._ll_tables.has_index())
[docs] def build_index(self):
        """
        Builds an index on this TableCollection. Any existing indexes are
        automatically dropped. See :ref:`sec_table_indexes` for information
        on indexes.
        """
        self._ll_tables.build_index()
[docs] def drop_index(self):
        """
        Drops any indexes present on this table collection. If the tables
        are not currently indexed this method has no effect. See
        :ref:`sec_table_indexes` for information on indexes.
        """
        self._ll_tables.drop_index()
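# A hedged sketch of the index round trip (editor's example): a fresh
# collection is not indexed; build_index() adds the edge indexes and
# drop_index() removes them.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=1)
#   assert not tables.has_index()
#   tables.build_index()
#   assert tables.has_index()
#   tables.drop_index()
#   assert not tables.has_index()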
[docs] def subset(
        self,
        nodes,
        record_provenance=True,
        *,
        reorder_populations=None,
        remove_unreferenced=None,
    ):
        """
        Modifies the tables in place to contain only the entries referring to
        the provided list of node IDs, with nodes reordered according to the
        order they appear in the list. Other tables are :meth:`sorted <sort>`
        to conform to the :ref:`sec_valid_tree_sequence_requirements`, and
        additionally sorted as described in the documentation for the
        equivalent tree sequence method :meth:`TreeSequence.subset`: please
        see this for more detail.

        :param list nodes: The list of nodes for which to retain information.
            This may be a numpy array (or array-like) object
            (dtype=np.int32).
        :param bool record_provenance: Whether to record a provenance entry
            in the provenance table for this operation.
        :param bool reorder_populations: Whether to reorder the population
            table (default: True). If False, the population table will not
            be altered in any way.
        :param bool remove_unreferenced: Whether sites, individuals, and
            populations that are not referred to by any retained entries in
            the tables should be removed (default: True). See the
            description for details.
        """
        reorder_populations = (
            True if reorder_populations is None else reorder_populations
        )
        remove_unreferenced = (
            True if remove_unreferenced is None else remove_unreferenced
        )
        nodes = util.safe_np_int_cast(nodes, np.int32)
        self._ll_tables.subset(
            nodes,
            reorder_populations=reorder_populations,
            remove_unreferenced=remove_unreferenced,
        )
        self.sort()
        if record_provenance:
            parameters = {"command": "subset", "nodes": nodes.tolist()}
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
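# A minimal sketch (editor's example): subset() keeps only the listed
# nodes, in list order, so the sample becomes node 0 and the unlisted node
# is dropped.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   parent = tables.nodes.add_row(time=1)
#   child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.nodes.add_row(time=3)  # not listed below, so dropped
#   tables.edges.add_row(0, 10, parent, child)
#   tables.subset([child, parent])
#   assert len(tables.nodes) == 2 and tables.edges[0].child == 0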
[docs] def union(
        self,
        other,
        node_mapping,
        check_shared_equality=True,
        add_populations=True,
        record_provenance=True,
    ):
        """
        Modifies the table collection in place by adding the non-shared
        portions of ``other`` to itself. To perform the node-wise union, the
        method relies on a ``node_mapping`` array that maps each node in
        ``other`` to its equivalent node in ``self``, or to ``tskit.NULL``
        if the node is exclusive to ``other``.

        See :meth:`TreeSequence.union` for a more detailed description.

        :param TableCollection other: Another table collection.
        :param list node_mapping: An array of node IDs that relate nodes in
            ``other`` to nodes in ``self``: the k-th element of
            ``node_mapping`` should be the index of the equivalent node in
            ``self``, or ``tskit.NULL`` if the node is not present in
            ``self`` (in which case it will be added to self).
        :param bool check_shared_equality: If True, the shared portions of
            the table collections will be checked for equality.
        :param bool add_populations: If True, nodes new to ``self`` will be
            assigned new population IDs.
        :param bool record_provenance: Whether to record a provenance entry
            in the provenance table for this operation.
        """
        node_mapping = util.safe_np_int_cast(node_mapping, np.int32)
        self._ll_tables.union(
            other._ll_tables,
            node_mapping,
            check_shared_equality=check_shared_equality,
            add_populations=add_populations,
        )
        if record_provenance:
            other_records = [prov.record for prov in other.provenances]
            other_timestamps = [prov.timestamp for prov in other.provenances]
            parameters = {
                "command": "union",
                "other": {"timestamp": other_timestamps, "record": other_records},
                "node_mapping": node_mapping.tolist(),
            }
            self.provenances.add_row(
                record=json.dumps(provenance.get_provenance_dict(parameters))
            )
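# A deliberately degenerate sketch (editor's example): unioning a
# collection with an identical copy under the identity node mapping
# declares every node shared, so the union is a no-op.
#
#   import numpy as np
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   parent = tables.nodes.add_row(time=1)
#   child = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, parent, child)
#   other = tables.copy()
#   node_mapping = np.arange(len(tables.nodes), dtype=np.int32)
#   tables.union(other, node_mapping, record_provenance=False)
#   assert len(tables.nodes) == 2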
[docs] def ibd_segments(
        self,
        *,
        within=None,
        between=None,
        max_time=None,
        min_span=None,
        store_pairs=None,
        store_segments=None,
    ):
        """
        Equivalent to the :meth:`TreeSequence.ibd_segments` method; please see
        its documentation for more details, and use this method only if you
        specifically need to work with a :class:`TableCollection` object.

        This method has the same data requirements as
        :meth:`TableCollection.simplify`. In particular, the tables in the
        collection must satisfy the
        :ref:`required <sec_valid_tree_sequence_requirements>` sorting
        orders. To enforce this, you can call :meth:`TableCollection.sort`
        before using this method.

        If the edge table contains any edges with identical parents and
        children over adjacent genomic intervals, any IBD intervals
        underneath the edges will also be split across the breakpoint(s). To
        prevent this behaviour, use :meth:`EdgeTable.squash` beforehand.

        :param list within: As for the :meth:`TreeSequence.ibd_segments`
            method.
        :param list[list] between: As for the
            :meth:`TreeSequence.ibd_segments` method.
        :param float max_time: As for the :meth:`TreeSequence.ibd_segments`
            method.
        :param float min_span: As for the :meth:`TreeSequence.ibd_segments`
            method.
        :param bool store_pairs: As for the :meth:`TreeSequence.ibd_segments`
            method.
        :param bool store_segments: As for the
            :meth:`TreeSequence.ibd_segments` method.
        :return: An :class:`.IdentitySegments` object containing the recorded
            IBD information.
        :rtype: IdentitySegments
        """
        max_time = np.inf if max_time is None else max_time
        min_span = 0 if min_span is None else min_span
        store_pairs = False if store_pairs is None else store_pairs
        store_segments = False if store_segments is None else store_segments
        if within is not None and between is not None:
            raise ValueError(
                "The ``within`` and ``between`` arguments are mutually exclusive"
            )
        if between is not None:
            sample_set_sizes = np.array(
                [len(sample_set) for sample_set in between], dtype=np.uint64
            )
            # hstack has some annoying quirks around its handling of empty
            # lists which we need to work around. In a way it would be more
            # convenient to detect these conditions as errors, but then we
            # end up having to work around edge cases in the tests and it's
            # mathematically neater this way.
            pre_flattened = [lst for lst in between if len(lst) > 0]
            if len(pre_flattened) == 0:
                flattened = []
            else:
                flattened = util.safe_np_int_cast(np.hstack(pre_flattened), np.int32)
            ll_result = self._ll_tables.ibd_segments_between(
                sample_set_sizes=sample_set_sizes,
                sample_sets=flattened,
                max_time=max_time,
                min_span=min_span,
                store_pairs=store_pairs,
                store_segments=store_segments,
            )
        else:
            if within is not None:
                within = util.safe_np_int_cast(within, np.int32)
            ll_result = self._ll_tables.ibd_segments_within(
                samples=within,
                max_time=max_time,
                min_span=min_span,
                store_pairs=store_pairs,
                store_segments=store_segments,
            )
        return IdentitySegments(
            ll_result,
            max_time=max_time,
            min_span=min_span,
            store_pairs=store_pairs,
            store_segments=store_segments,
        )
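# A minimal sketch (editor's example): two samples joined at a root share
# a single IBD segment spanning the whole sequence.
#
#   import tskit
#
#   tables = tskit.TableCollection(sequence_length=10)
#   parent = tables.nodes.add_row(time=1)
#   a = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   b = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
#   tables.edges.add_row(0, 10, parent, a)
#   tables.edges.add_row(0, 10, parent, b)
#   tables.sort()
#   segs = tables.ibd_segments(store_pairs=True)
#   assert segs.num_pairs == 1 and segs.num_segments == 1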