diff --git a/CHANGELOG.md b/CHANGELOG.md index 36c9fe5..c848b1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Implemented R's `split()` utility to split a sequence by a grouping factor. - Turn `match()` into a generic for specialization by other BiocPy classes. +- Implement `duplicated()` generic for simple sequences and Factors. ## Version 0.3.0 - 0.3.4 diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py index 4a68565..fb40e20 100644 --- a/src/biocutils/__init__.py +++ b/src/biocutils/__init__.py @@ -63,4 +63,6 @@ from .biocobject import BiocObject from .table import table + +from .duplicated import duplicated, unique from .split import split diff --git a/src/biocutils/duplicated.py b/src/biocutils/duplicated.py new file mode 100644 index 0000000..303af2a --- /dev/null +++ b/src/biocutils/duplicated.py @@ -0,0 +1,201 @@ +from functools import singledispatch +from typing import Any, Sequence, Union + +import numpy + +from .Factor import Factor +from .subset import subset + + +@singledispatch +def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> numpy.ndarray: + """ + Find duplicated elements of ``x``. + + Args: + x: + Object to be searched for duplicates. + This is usually a sequence that can be iterated over. + + incomparables: + Values of ``x`` that cannot be compared. + Any value of ``x`` in ``incomparables`` will never be a duplicate. + Any object that supports the ``in`` operator (e.g., via a ``__contains__`` method) can be used here. + + from_last: + Whether to report the last occurrence as a non-duplicate. + + Returns: + NumPy array of length equal to that of ``x``, + containing truthy values for every occurrence of a value of ``x`` other than its first occurrence. + If ``from_last = True``, truthy values are instead reported for every occurrence other than the last occurrence of each value of ``x``. + + Examples: + >>> import biocutils + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... 1, + ... 2, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.duplicated( + ... 
[ + ... 1, + ... 2, + ... 1, + ... 2, + ... 3, + ... 2, + ... ], + ... from_last=True, + ... ) + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ], + ... incomparables=set( + ... [None] + ... ), + ... ) + """ + + available = set() + output = numpy.ndarray(len(x), dtype=numpy.bool_) + + def process(i, y): + if y in incomparables: + output[i] = False + elif y in available: + output[i] = True + else: + available.add(y) + output[i] = False + + if not from_last: + for i, y in enumerate(x): + process(i, y) + else: + for i in range(len(x) - 1, -1, -1): + process(i, x[i]) + + return output + + +@duplicated.register +def _duplicated_Factor( + x: Factor, incomparables: Union[set, Sequence] = set(), from_last: bool = False +) -> numpy.ndarray: + present = [] + for lev in x.get_levels(): + if lev in incomparables: + present.append(None) + else: + present.append(False) + + # Handling codes of -1, i.e., None. + if None in incomparables: + present.append(None) + else: + present.append(False) + + output = numpy.ndarray(len(x), dtype=numpy.bool_) + + def process(i, y): + tmp = present[y] + if tmp is None: + output[i] = False + elif tmp: + output[i] = True + else: + present[y] = True + output[i] = False + + if not from_last: + for i, y in enumerate(x.get_codes()): + process(i, y) + else: + codes = x.get_codes() + for i in range(len(x) - 1, -1, -1): + process(i, codes[i]) + + return output + + +def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> Any: + """ + Get all unique values of ``x``. + + Args: + x: + Object in which to find unique entries. + This is usually a sequence that can be iterated over. + + incomparables: + Values of ``x`` that cannot be compared. + Any value of ``x`` in ``incomparables`` will never be a duplicate. 
+ Any object that supports the ``in`` operator (e.g., via a ``__contains__`` method) can be used here. + + from_last: + Whether to retain the last occurrence of each value in ``x``. + By default, the first occurrence is retained. + + Returns: + An object containing unique values of ``x``. + This is usually of the same class as ``x``. + + Examples: + >>> import biocutils + >>> biocutils.unique( + ... [ + ... 1, + ... 2, + ... 1, + ... 2, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.unique( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.unique( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ], + ... incomparables=set( + ... [None] + ... ), + ... ) + """ + return subset(x, numpy.where(numpy.logical_not(duplicated(x, incomparables=incomparables, from_last=from_last)))[0]) diff --git a/src/biocutils/match.py b/src/biocutils/match.py index 1570970..1ebaaf1 100644 --- a/src/biocutils/match.py +++ b/src/biocutils/match.py @@ -1,5 +1,5 @@ -from typing import Any, Optional, Literal, Union, Sequence from functools import singledispatch +from typing import Any, Literal, Optional, Sequence, Union import numpy diff --git a/tests/test_duplicated.py b/tests/test_duplicated.py new file mode 100644 index 0000000..3edeb6c --- /dev/null +++ b/tests/test_duplicated.py @@ -0,0 +1,23 @@ +import biocutils + + +def test_duplicated_basic(): + assert list(biocutils.duplicated([1,2,1,2,3,2])) == [False, False, True, True, False, True] + assert list(biocutils.duplicated([1,2,1,2,3,2], from_last=True)) == [True, True, False, True, False, False] + assert list(biocutils.duplicated([1,2,None,None,3,2,3])) == [False, False, False, True, False, True, True] + assert list(biocutils.duplicated([1,2,None,None,3,2,3], incomparables=set([None]))) == [False, False, False, False, False, True, True] + + +def test_duplicated_Factor(): + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]))) == [False, False, True, True, False, True] + assert 
list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]), from_last=True)) == [True, True, False, True, False, False] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]))) == [False, False, False, True, False, True, True] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]), incomparables=set([None]))) == [False, False, False, False, False, True, True] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]), incomparables=set(["2"]))) == [False, False, False, True, False, False, True] + + +def test_unique(): + assert biocutils.unique([1,2,1,2,3,2]) == [1,2,3] + assert biocutils.unique([1,2,1,2,3,2], from_last=True) == [1,3,2] + assert biocutils.unique([1,2,None,None,3,2]) == [1,2,None,3] + assert biocutils.unique([1,2,None,None,3,2], incomparables=set([None])) == [1,2,None,None,3]