Source code for arkouda.sorting

from __future__ import annotations

from enum import Enum
from typing import Sequence, Union, cast

from typeguard import check_type, typechecked

from arkouda.client import generic_msg
from arkouda.numpy.dtypes import bigint, bool_, dtype, float64, int64, int_scalars, uint64
from arkouda.pdarrayclass import create_pdarray, pdarray
from arkouda.pdarraycreation import zeros
from arkouda.strings import Strings

numeric_dtypes = {dtype(int64), dtype(uint64), dtype(float64)}

__all__ = ["argsort", "coargsort", "sort"]

SortingAlgorithm = Enum("SortingAlgorithm", ["RadixSortLSD", "TwoArrayRadixSort"])


[docs] def argsort( pda: Union[pdarray, Strings, "Categorical"], # type: ignore # noqa algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD, axis: int_scalars = 0, ) -> pdarray: """ Return the permutation that sorts the array. Parameters ---------- pda : pdarray or Strings or Categorical The array to sort (int64, uint64, or float64) Returns ------- pdarray, int64 The indices such that ``pda[indices]`` is sorted Raises ------ TypeError Raised if the parameter is other than a pdarray or Strings See Also -------- coargsort Notes ----- Uses a least-significant-digit radix sort, which is stable and resilient to non-uniformity in data but communication intensive. Examples -------- >>> a = ak.randint(0, 10, 10) >>> perm = ak.argsort(a) >>> a[perm] array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9]) """ from arkouda.categorical import Categorical if axis < -1 or axis > pda.ndim: raise ValueError(f"Axis must be between -1 and the PD Array's rank ({pda.ndim})") if axis == -1: axis = pda.ndim - 1 check_type(argname="argsort", value=pda, expected_type=Union[pdarray, Strings, Categorical]) if hasattr(pda, "argsort"): return cast(Categorical, pda).argsort() if pda.size == 0 and hasattr(pda, "dtype"): return zeros(0, dtype=pda.dtype) if isinstance(pda, pdarray) and pda.dtype == bigint: return coargsort(pda.bigint_to_uint_arrays(), algorithm) if isinstance(pda, Strings): repMsg = generic_msg( cmd="argsortStrings", args={ "name": pda.entry.name, "algoName": algorithm.name, }, ) else: repMsg = generic_msg( cmd=f"argsort<{pda.dtype.name},{pda.ndim}>", args={ "name": pda.name, "algoName": algorithm.name, "objType": pda.objType, "axis": axis, }, ) return create_pdarray(cast(str, repMsg))
[docs] def coargsort( arrays: Sequence[Union[Strings, pdarray, "Categorical"]], # type: ignore # noqa algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD, ) -> pdarray: """ Return the permutation that groups the rows (left-to-right), if the input arrays are treated as columns. The permutation sorts numeric columns, but not strings/Categoricals -- strings/Categoricals are grouped, but not ordered. Parameters ---------- arrays : Sequence[Union[Strings, pdarray, Categorical]] The columns (int64, uint64, float64, Strings, or Categorical) to sort by row Returns ------- pdarray, int64 The indices that permute the rows to grouped order Raises ------ ValueError Raised if the pdarrays are not of the same size or if the parameter is not an Iterable containing pdarrays, Strings, or Categoricals See Also -------- argsort Notes ----- Uses a least-significant-digit radix sort, which is stable and resilient to non-uniformity in data but communication intensive. Starts with the last array and moves forward. This sort operates directly on numeric types, but for Strings, it operates on a hash. Thus, while grouping of equivalent strings is guaranteed, lexicographic ordering of the groups is not. For Categoricals, coargsort sorts based on Categorical.codes which guarantees grouping of equivalent categories but not lexicographic ordering of those groups. Examples -------- >>> a = ak.array([0, 1, 0, 1]) >>> b = ak.array([1, 1, 0, 0]) >>> perm = ak.coargsort([a, b]) >>> perm array([2, 0, 3, 1]) >>> a[perm] array([0, 0, 1, 1]) >>> b[perm] array([0, 1, 0, 1]) """ from arkouda.categorical import Categorical from arkouda.numeric import cast as akcast check_type( argname="coargsort", value=arrays, expected_type=Sequence[Union[pdarray, Strings, Categorical]] ) size: int_scalars = -1 anames = [] atypes = [] expanded_arrays = [] for a in arrays: if not isinstance(a, pdarray) or a.dtype not in [bigint, bool_]: expanded_arrays.append(a) elif a.dtype == bigint: expanded_arrays.extend(a.bigint_to_uint_arrays()) else: # cast bool arrays to int expanded_arrays.append(akcast(a, "int")) for a in expanded_arrays: if isinstance(a, pdarray): anames.append(a.name) atypes.append(a.objType) elif isinstance(a, Categorical): anames.append(a.codes.name) atypes.append(a.objType) elif isinstance(a, Strings): atypes.append(a.objType) anames.append(a.entry.name) else: raise ValueError("Argument must be an iterable of pdarrays, Strings, or Categoricals") if size == -1: size = a.size elif size != a.size: raise ValueError("All pdarrays, Strings, or Categoricals must be of the same size") if size == 0: return zeros(0, dtype=int if isinstance(arrays[0], (Strings, Categorical)) else arrays[0].dtype) repMsg = generic_msg( cmd="coargsort", args={ "algoName": algorithm.name, "nstr": len(expanded_arrays), "arr_names": anames, "arr_types": atypes, }, ) return create_pdarray(cast(str, repMsg))
[docs] @typechecked def sort(pda: pdarray, algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD, axis=-1) -> pdarray: """ Return a sorted copy of the array. Only sorts numeric arrays; for Strings, use argsort. Parameters ---------- pda : pdarray or Categorical The array to sort (int64, uint64, or float64) Returns ------- pdarray, int64, uint64, or float64 The sorted copy of pda Raises ------ TypeError Raised if the parameter is not a pdarray ValueError Raised if sort attempted on a pdarray with an unsupported dtype such as bool See Also -------- argsort Notes ----- Uses a least-significant-digit radix sort, which is stable and resilient to non-uniformity in data but communication intensive. Examples -------- >>> a = ak.randint(0, 10, 10) >>> sorted = ak.sort(a) >>> a array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9]) """ if pda.dtype == bigint: return pda[coargsort(pda.bigint_to_uint_arrays(), algorithm)] if pda.dtype not in numeric_dtypes: raise ValueError(f"ak.sort supports int64, uint64, or float64, not {pda.dtype}") if pda.size == 0: return zeros(0, dtype=pda.dtype) repMsg = generic_msg( cmd=f"sort{pda.ndim}D", args={"alg": algorithm.name, "array": pda, "axis": axis} ) return create_pdarray(cast(str, repMsg))