tskit 0.4.0

A quick rundown of whats coming in 0.4.0, full changelog with links to issues at https://github.com/tskit-dev/tskit/releases/tag/0.4.0

import tskit
import msprime
import numpy as np
tskit.__version__
'0.4.0'

Breaking changes

!tskit info tests/data/SLiM/single-locus-example.trees
╔═══════════════════════╗
║TreeSequence           ║
╠═══════════════╤═══════╣
║Trees          │      1║
╟───────────────┼───────╢
║Sequence Length│     10║
╟───────────────┼───────╢
║Time Units     │unknown║
╟───────────────┼───────╢
║Sample Nodes   │     10║
╟───────────────┼───────╢
║Total Size     │1.8 KiB║
╚═══════════════╧═══════╝
╔═══════════╤════╤═════════╤════════════╗
║Table      │Rows│Size     │Has Metadata║
╠═══════════╪════╪═════════╪════════════╣
║Edges      │  12│392 Bytes│          No║
╟───────────┼────┼─────────┼────────────╢
║Individuals│   5│404 Bytes│         Yes║
╟───────────┼────┼─────────┼────────────╢
║Migrations │   0│  8 Bytes│          No║
╟───────────┼────┼─────────┼────────────╢
║Mutations  │   0│ 16 Bytes│          No║
╟───────────┼────┼─────────┼────────────╢
║Nodes      │  15│578 Bytes│         Yes║
╟───────────┼────┼─────────┼────────────╢
║Populations│   2│112 Bytes│         Yes║
╟───────────┼────┼─────────┼────────────╢
║Provenances│   1│176 Bytes│          No║
╟───────────┼────┼─────────┼────────────╢
║Sites      │   0│ 16 Bytes│          No║
╚═══════════╧════╧═════════╧════════════╝
# Lets make a tree sequence to use in subsequent examples
ts = msprime.sim_ancestry(20, sequence_length=10)
ts = msprime.sim_mutations(ts,rate=1)
tc = ts.tables
ts
Tree Sequence
Trees1
Sequence Length10.0
Time Unitsunknown
Sample Nodes40
Total Size14.4 KiB
MetadataNo Metadata
Table Rows Size Has Metadata
Edges 78 2.4 KiB
Individuals 20 584 Bytes
Migrations 0 8 Bytes
Mutations 180 6.5 KiB
Nodes 79 2.2 KiB
Populations 1 224 Bytes
Provenances 2 1.6 KiB
Sites 10 266 Bytes
# This means the dtypes for offset columns has changed
tc.nodes.metadata_offset.dtype
dtype('uint64')
# Get a tree to look at
t = ts.first()
t
Tree
Index0
Interval0-10 (10)
Roots1
Nodes79
Sites10
Mutations180
Total Branch Length17.525159
#How many usual nodes are in the tree
len(t.preorder())
79
#The quintuply linked arrays have an extra element - the virtual root
len(t.parent_array)
80
#The normal root is the penultimate node in those arrays:
t.root
78
#And it's parent is NULL as before
t.parent_array[t.root]
-1
#The virtual root is the last element
t.virtual_root
79
# And its children are the normal roots
t.left_child_array[t.virtual_root]
78
# Encode some bad metadata and see that we get a nice, helpful error now.
tskit.metadata.JSONCodec({"codec":"json"}).encode({"a":np.array([])})
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

~/projects/tskit/python/tskit/metadata.py in encode(self, obj)
    158         try:
--> 159             return tskit.canonical_json(obj).encode()
    160         except TypeError as e:


~/projects/tskit/python/tskit/util.py in canonical_json(obj)
     57     """
---> 58     return json.dumps(obj, sort_keys=True, separators=(",", ":"))
     59 


~/miniconda3/lib/python3.9/json/__init__.py in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    233         cls = JSONEncoder
--> 234     return cls(
    235         skipkeys=skipkeys, ensure_ascii=ensure_ascii,


~/miniconda3/lib/python3.9/json/encoder.py in encode(self, o)
    198         # equivalent to the PySequence_Fast that ''.join() would do.
--> 199         chunks = self.iterencode(o, _one_shot=True)
    200         if not isinstance(chunks, (list, tuple)):


~/miniconda3/lib/python3.9/json/encoder.py in iterencode(self, o, _one_shot)
    256                 self.skipkeys, _one_shot)
--> 257         return _iterencode(o, 0)
    258 


~/miniconda3/lib/python3.9/json/encoder.py in default(self, o)
    178         """
--> 179         raise TypeError(f'Object of type {o.__class__.__name__} '
    180                         f'is not JSON serializable')


TypeError: Object of type ndarray is not JSON serializable


During handling of the above exception, another exception occurred:


MetadataEncodingError                     Traceback (most recent call last)

/tmp/ipykernel_2089335/467480944.py in <module>
      1 # Encode some bad metadata and see that we get a nice, helpful error now.
----> 2 tskit.metadata.JSONCodec({"codec":"json"}).encode({"a":np.array([])})


~/projects/tskit/python/tskit/metadata.py in encode(self, obj)
    159             return tskit.canonical_json(obj).encode()
    160         except TypeError as e:
--> 161             raise exceptions.MetadataEncodingError(
    162                 f"Could not encode metadata of type {str(e).split()[3]}"
    163             )


MetadataEncodingError: Could not encode metadata of type ndarray
# To skip sorting, specifiy to start sorting at the end.
tc.sort(site_start=ts.num_sites, mutation_start=ts.num_mutations)
# Method is similar to other sorts
tc.sort_individuals()
# Lets replace this row for one with metadata
tc.nodes[0]
NodeTableRow(flags=1, time=0.0, population=0, individual=0, metadata=b'')
#Firstly set a schema
tc.nodes.metadata_schema = tskit.metadata.MetadataSchema({"codec":"json"})
#Then we can simply assign a row with replaced metadata
tc.nodes[0] = tc.nodes[0].replace(metadata={"foo":"bar"})
#If you want to replace an attribute on all rows - this way is faster:
copy=tc.nodes.copy()
tc.nodes.clear()
for n in copy:
    tc.nodes.append(n.replace(metadata={"foo":"bar"}))
ts.samples()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39], dtype=int32)
#Specify time as an interval
ts.samples(time=(0.1,0.2))
array([], dtype=int32)
#old way md = [n.metadata["foo"] for n in tc.nodes]
tc.nodes.metadata_vector(key=["foo"], default_value="wtf")
array(['bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar',
       'bar', 'bar', 'bar', 'bar', 'bar', 'bar', 'bar'], dtype='<U3')
tc.time_units
'unknown'
#Make a tree sequence where the time dimension is uncalibrated
tc.time_units=tskit.TIME_UNITS_UNCALIBRATED
tc.time_units
ts = tc.tree_sequence()
#An error gets thrown if you try to do a statistic that makes no sense given the dimension
ts.allele_frequency_spectrum(mode="branch")
---------------------------------------------------------------------------

LibraryError                              Traceback (most recent call last)

/tmp/ipykernel_2089335/3925147128.py in <module>
      1 #An error gets thrown if you try to do a statistic that makes no sense given the dimension
----> 2 ts.allele_frequency_spectrum(mode="branch")


~/projects/tskit/python/tskit/trees.py in allele_frequency_spectrum(self, sample_sets, windows, mode, span_normalise, polarised)
   7214         if sample_sets is None:
   7215             sample_sets = [self.samples()]
-> 7216         return self.__one_way_sample_set_stat(
   7217             self._ll_tree_sequence.allele_frequency_spectrum,
   7218             sample_sets,


~/projects/tskit/python/tskit/trees.py in __one_way_sample_set_stat(self, ll_method, sample_sets, windows, mode, span_normalise, polarised)
   6499 
   6500         flattened = util.safe_np_int_cast(np.hstack(sample_sets), np.int32)
-> 6501         stat = self.__run_windowed_stat(
   6502             windows,
   6503             ll_method,


~/projects/tskit/python/tskit/trees.py in __run_windowed_stat(self, windows, method, *args, **kwargs)
   6461         strip_dim = windows is None
   6462         windows = self.parse_windows(windows)
-> 6463         stat = method(*args, **kwargs, windows=windows)
   6464         if strip_dim:
   6465             stat = stat[0]


LibraryError: Statistics using branch lengths cannot be calculated when time_units is 'uncalibrated'
# For our example all the positions are integers
ts.discrete_genome
True
ts.discrete_time
False
list(ts.alignments())
list(ts.alignments(reference_sequence=tskit.random_nucleotides(ts.sequence_length)))
['GAGTCCTGCA',
 'GAAGCCAATG',
 'AAGCACATGA',
 'GTCTCGTCTA',
 'GAGTCATACT',
 'GAGTCATACT',
 'CATGTGATAA',
 'TTATCGCACT',
 'CCCGTGCACC',
 'AAACTCCTGT',
 'TAAAACATGA',
 'TTCTCAATCG',
 'GAATCGAATA',
 'TTATCGGACG',
 'GAAGCTAGCC',
 'TTATCGCAAT',
 'AAGTACATGA',
 'TTATCGGACG',
 'TTATCGGACG',
 'TTCTCAATCG',
 'GAACCTAGTC',
 'TGGGAATTCG',
 'TACAACATTA',
 'GAAGCCAATC',
 'AAGTACATGA',
 'CGCGCCCGCC',
 'ACTTTAATGC',
 'GAACCGAGTC',
 'GAACCGAGTC',
 'TTATCGCAAT',
 'TTATCGCACC',
 'ACCTACATGC',
 'GAACCGAGTC',
 'CCCGTGTACC',
 'GTATCGCACT',
 'AAGTACATGA',
 'ACATACATGC',
 'ATGTCGTATA',
 'GAGACATACC',
 'TACTACATTA']

Reference sequence

You can now store a reference in the tree sequence! (jeromekelleher, benjeffery) This API is preliminary, we intend to add ways to fetch from URLs or files to avoid duplication.

tc.reference_sequence.data = "ATCG"
tc.reference_sequence.metadata_schema = tskit.MetadataSchema({"codec":"json"})
tc.reference_sequence.metadata = {"accession":"someid", "other":"stuff"}
tc.reference_sequence.data
'ATCG'
tc.reference_sequence.metadata
{'accession': 'someid', 'other': 'stuff'}