diff --git a/CHANGELOG.md b/CHANGELOG.md index ed832efc82a8dc48a18229e142e5289cc371b158..a08564833f39f7b9ed3c0e71e7f3ada2d6c1f628 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ The `newick` package adheres to [Semantic Versioning](http://semver.org/spec/v2. ## [Unreleased] +## [v1.3.1] - 2021-10-14 + +Fixed support for node annotations for the case when annotations are between `:` and length. + + ## [v1.3.0] - 2021-05-04 Added support for reading and writing of node annotations (in comments). diff --git a/README.md b/README.md index 96ff265214443f8fb7b101ac6bb27fb073ece386..0b09547b966d2000848fb454e4d4c66fedf624e3 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,19 @@ The `newick` package allows to deal with comments in two ways. >>> newick.loads('(a[annotation],b)c;')[0].newick '(a[annotation],b)c' ``` + Annotations may come before or after the `:` which separates node label and length: +- ```python + >>> newick.loads('(a[annotation]:2,b)c;')[0].descendants[0].length + 2.0 + >>> newick.loads('(a:[annotation]2,b)c;')[0].descendants[0].length + 2.0 + ``` + but if they precede the colon, they must not contain `:`: +- ```python + >>> newick.loads('(a[annotation:]:2,b)c;')[0].descendants[0].comment + ... 
+ ValueError: Node names or branch lengths must not contain ":" + ``` Note that square brackets inside *quoted labels* will **not** be interpreted as comments or annotations: diff --git a/setup.py b/setup.py index 2bb6fe7e8178b871065a8e60abb644c4cc35c514..00c50052c20b20631d14706e009a695352415aea 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name='newick', - version="1.3.1.dev0", + version="1.3.1", description='A python module to read and write the Newick format', long_description=open("README.md").read(), long_description_content_type="text/markdown", diff --git a/src/newick.py b/src/newick.py index f28341a7b17b0f8110074ea14db0869f37c2e3d2..170e3258955696e62d25a59cf2051112d4562652 100644 --- a/src/newick.py +++ b/src/newick.py @@ -6,7 +6,7 @@ Functionality to read and write the Newick serialization format for trees. import re import pathlib -__version__ = "1.3.1.dev0" +__version__ = "1.3.1" RESERVED_PUNCTUATION = ':;,()' COMMENT = re.compile(r'\[[^]]*]') @@ -39,8 +39,7 @@ class Node(object): """ for char in RESERVED_PUNCTUATION: if (name and char in name) or (length and char in length): - raise ValueError( - 'Node names or branch lengths must not contain "%s"' % char) + raise ValueError('Node names or branch lengths must not contain "{}"'.format(char)) self.name = name self.comment = comment self._length = length @@ -48,6 +47,7 @@ class Node(object): self.ancestor = None self._length_parser = kw.pop('length_parser', length_parser) self._length_formatter = kw.pop('length_formatter', length_formatter) + self._colon_before_comment = kw.pop('colon_before_comment', False) def __repr__(self): return 'Node("%s")' % self.name @@ -86,11 +86,17 @@ class Node(object): @property def newick(self): """The representation of the Node in Newick format.""" + colon_done = False label = self.name or '' if self.comment: + if self._length and self._colon_before_comment: + label += ':' + colon_done = True label += 
'[{}]'.format(self.comment) if self._length: - label += ':' + self._length + if not colon_done: + label += ':' + label += self._length descendants = ','.join([n.newick for n in self.descendants]) if descendants: descendants = '(' + descendants + ')' @@ -426,16 +432,27 @@ def write(tree, fname, encoding='utf8'): def _parse_name_and_length(s): - length, comment = None, None + length, comment, colon_before_comment = None, None, False if ':' in s: - parts = s.split(':') - if ']' not in parts[-1]: # A ] in length doesn't make sense - the : must be in a comment. - s = ':'.join(parts[:-1]) - length = parts[-1] + # Comments may be placed between ":" and length, or between name and ":"! + # In any case, we interpret the first occurrence of ":" as separator for length, i.e. + # a ":" in an annotation **before** the length separator will screw things up. + before, _, after = s.partition(':') + if before.endswith(']'): + assert '[' in before and '[' not in after, s + s, comment = before[:-1].split('[', maxsplit=1) + length = after + elif after.startswith('['): + assert ']' in after and '[' not in before, s + colon_before_comment = True + comment, length = after[1:].split(']', maxsplit=1) + s = before + else: + s, length = before, after if '[' in s and s.endswith(']'): # This looks like a node annotation in a comment. 
s, comment = s.split('[', maxsplit=1) comment = comment[:-1] - return s or None, length or None, comment + return s or None, length or None, comment, colon_before_comment def _parse_siblings(s, **kw): @@ -484,5 +501,11 @@ def parse_node(s, strip_comments=False, **kw): raise ValueError('unmatched braces %s' % parts[0][:100]) descendants = list(_parse_siblings(')'.join(parts[:-1])[1:], **kw)) label = parts[-1] - name, length, comment = _parse_name_and_length(label) - return Node.create(name=name, length=length, comment=comment, descendants=descendants, **kw) + name, length, comment, colon_before_comment = _parse_name_and_length(label) + return Node.create( + name=name, + length=length, + comment=comment, + colon_before_comment=colon_before_comment, + descendants=descendants, + **kw) diff --git a/tests/test_newick.py b/tests/test_newick.py index 24b40780c4795d1565e91c83b514308ff8de669f..103f1caa761880c99df8858470f3a35f85001fc2 100644 --- a/tests/test_newick.py +++ b/tests/test_newick.py @@ -391,4 +391,58 @@ def test_with_comments(): tree = loads(nwk)[0] assert tree.comment.startswith('y') assert tree.descendants[0].name == '1' and tree.descendants[0].comment.startswith('x') - assert tree.newick == nwk \ No newline at end of file + assert tree.newick == nwk + + +def test_with_comments_beast(): + nwk = "((((20:[&rate=9.363171791537587E-5]1320.9341043566992,(21:[&rate=9.363171791537587E-5]" \ + "1225.8822690335624,(((((15:[&rate=9.363171791537587E-5]638.1949811891477,16:[&rate=" \ + "9.363171791537587E-5]638.1949811891477):[&rate=9.363171791537587E-5]257.76795318129564" \ + ",8:[&rate=9.363171791537587E-5]895.9629343704433):[&rate=9.363171791537587E-5]" \ + "41.795862802882425,12:[&rate=9.363171791537587E-5]937.7587971733258):" \ + "[&rate=9.363171791537587E-5]95.6952785114238,14:[&rate=9.363171791537587E-5]" \ + "1033.4540756847496):[&rate=9.363171791537587E-5]59.28887326566064,((25:" \ + "[&rate=9.363171791537587E-5]368.1724945784702,28:[&rate=9.363171791537587E-" \ + 
"5]368.1724945784702):[&rate=9.363171791537587E-5]618.1292632448451,(13:[&rate=" \ + "9.363171791537587E-5]894.6169275367406,((22:[&rate=9.363171791537587E-5]532." \ + "4463352965287,33:[&rate=9.363171791537587E-5]532.4463352965287):[&rate=9." \ + "363171791537587E-5]124.75991679524702,19:[&rate=9.363171791537587E-5]657." \ + "2062520917757):[&rate=9.363171791537587E-5]237.4106754449649):[&rate=9." \ + "363171791537587E-5]91.68483028657465):[&rate=9.363171791537587E-5]106.44119112709495):" \ + "[&rate=9.363171791537587E-5]133.13932008315214):[&rate=9.363171791537587E-5]95." \ + "05183532313686):[&rate=9.363171791537587E-5]239.53051384576952,((23:[&rate=9." \ + "363171791537587E-5]886.6590941437129,2:[&rate=9.363171791537587E-5]886.6590941437129):" \ + "[&rate=9.363171791537587E-5]318.065540579532,((6:[&rate=9.363171791537587E-5]1128." \ + "8289029154403,37:[&rate=9.363171791537587E-5]1128.8289029154403):[&rate=9." \ + "363171791537587E-5]17.349382774569676,((((((3:[&rate=9.363171791537587E-5]459." \ + "5487115479798,36:[&rate=9.363171791537587E-5]459.5487115479798):[&rate=9." \ + "363171791537587E-5]306.57918484718175,(31:[&rate=9.363171791537587E-5]485." \ + "4575256190764,34:[&rate=9.363171791537587E-5]485.4575256190764):[&rate=9." \ + "363171791537587E-5]280.6703707760851):[&rate=9.363171791537587E-5]15.246829791795335," \ + "(30:[&rate=9.363171791537587E-5]543.1657161064542,1:[&rate=9.363171791537587E-5]543." \ + "1657161064542):[&rate=9.363171791537587E-5]238.2090100805027):[&rate=9." \ + "363171791537587E-5]118.69392508203657,((7:[&rate=9.363171791537587E-5]520." \ + "3998734304117,35:[&rate=9.363171791537587E-5]520.3998734304117):[&rate=9." \ + "363171791537587E-5]238.7668559806733,(32:[&rate=9.363171791537587E-5]720." \ + "2892667226898,17:[&rate=9.363171791537587E-5]720.2892667226898):[&rate=9." 
\ + "363171791537587E-5]38.87746268839521):[&rate=9.363171791537587E-5]140.9019218579084)" \ + ":[&rate=9.363171791537587E-5]52.21797041264119,26:[&rate=9.363171791537587E-5]" \ + "952.2866216816346):[&rate=9.363171791537587E-5]163.25701515522496,((18:[&rate=9." \ + "363171791537587E-5]720.6233628054213,10:[&rate=9.363171791537587E-5]720.6233628054213):" \ + "[&rate=9.363171791537587E-5]119.64362661776931,(29:[&rate=9.363171791537587E-5]617." \ + "5158316030422,(9:[&rate=9.363171791537587E-5]593.9192324440043,(11:[&rate=9." \ + "363171791537587E-5]472.3642192781455,27:[&rate=9.363171791537587E-5]472.3642192781455)" \ + ":[&rate=9.363171791537587E-5]121.55501316585872):[&rate=9.363171791537587E-5]23." \ + "596599159037964):[&rate=9.363171791537587E-5]222.75115782014836):[&rate=9." \ + "363171791537587E-5]275.276647413669):[&rate=9.363171791537587E-5]30.63464885315034):" \ + "[&rate=9.363171791537587E-5]58.54634903323495):[&rate=9.363171791537587E-5]355." \ + "73998347922384):[&rate=9.363171791537587E-5]1186.6682306101936,24:[&rate=9." \ + "363171791537587E-5]2747.1328488126624):[&rate=9.363171791537587E-5]301.4581721015056," \ + "(38:[&rate=9.363171791537587E-5]963.0459960655501,(5:[&rate=9.363171791537587E-5]500." \ + "66376645282014,4:[&rate=9.363171791537587E-5]500.66376645282014):[&rate=9." \ + "363171791537587E-5]462.38222961272993):[&rate=9.363171791537587E-5]2085.5450248486177)" + tree = loads(nwk)[0] + assert tree.descendants[0].comment == '&rate=9.363171791537587E-5' + assert tree.descendants[0].name is None + assert tree.descendants[0].length == pytest.approx(301.4581721015056) + assert tree.newick == nwk