from __future__ import annotations
from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, overload
from typing import cast as type_cast
import numpy as np
import pandas as pd
from numpy import ndarray
from numpy.typing import NDArray
from pandas import CategoricalDtype as pd_CategoricalDtype
from pandas import StringDtype as pd_StringDtype
from pandas.api.extensions import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype
from arkouda.numpy.pdarrayclass import pdarray
from ._arkouda_array import ArkoudaArray
from ._arkouda_extension_array import ArkoudaExtensionArray
from ._arkouda_string_array import ArkoudaStringArray
from ._dtypes import ArkoudaCategoricalDtype
# ``Categorical`` is imported for real only while static type checkers analyse
# the module; at runtime the name is bound to a placeholder ``TypeVar`` instead.
if TYPE_CHECKING:
    from arkouda.pandas.categorical import Categorical
else:
    Categorical = TypeVar("Categorical")

# Names exported by a star-import of this module.
__all__ = ["ArkoudaCategorical"]
# [docs]  (Sphinx source-link marker left over from HTML extraction; not code)
class ArkoudaCategorical(ArkoudaExtensionArray, ExtensionArray):
    """
    Arkouda-backed categorical pandas ExtensionArray.

    Ensures the underlying data is an Arkouda ``Categorical``. Accepts an existing
    ``Categorical`` or converts from Python/NumPy sequences of labels.

    Parameters
    ----------
    data : Categorical | ArkoudaCategorical | ndarray | Sequence[Any]
        Input to wrap or convert.

        - If ``Categorical``, used directly.
        - If another ``ArkoudaCategorical``, its backing object is reused.
        - If list/tuple/ndarray, converted via ``ak.Categorical(ak.array(data))``.

    Raises
    ------
    TypeError
        If ``data`` cannot be converted to Arkouda ``Categorical``.

    Attributes
    ----------
    default_fill_value : str
        Sentinel used when filling missing values (default: "").
    """

    default_fill_value: str = ""

    def __init__(self, data: Categorical | "ArkoudaCategorical" | ndarray | Sequence[Any]):
        from arkouda import Categorical as AkCategorical
        from arkouda import array

        # Another wrapper: share its backing Categorical instead of converting.
        if isinstance(data, ArkoudaCategorical):
            self._data = data._data
            return

        if not isinstance(data, AkCategorical):
            try:
                data = AkCategorical(array(data))
            except Exception as e:
                # ``data`` is still the caller's object here, so the message
                # reports the type that failed to convert.
                raise TypeError(
                    f"Expected arkouda.Categorical or sequence convertible to one, "
                    f"got {type(data).__name__}"
                ) from e

        self._data = data

    def __getitem__(self, key: Any) -> Any:
        """
        Retrieve one or more categorical values.

        Parameters
        ----------
        key : Any
            Location(s) to retrieve. Supported forms include:

            * scalar integer index
            * slice objects (e.g. ``1:3``)
            * NumPy integer array (any integer dtype)
            * NumPy boolean mask with the same length as the array
            * Python list of integers or booleans
            * Arkouda ``pdarray`` of integers or booleans

        Returns
        -------
        Any
            A Python scalar for scalar access, or a new
            :class:`ArkoudaCategorical` for non-scalar indexers.
            A non-empty array-like indexer of length 1 that selects exactly one
            element also yields a scalar.

        Raises
        ------
        TypeError
            If a NumPy indexer with an unsupported dtype is provided.

        Examples
        --------
        >>> import numpy as np
        >>> import arkouda as ak
        >>> from arkouda.pandas.extension import ArkoudaCategorical
        >>> data = ak.Categorical(ak.array(["a", "b", "c", "d"]))
        >>> arr = ArkoudaCategorical(data)

        Scalar access returns a Python string-like scalar:

        >>> arr[1]
        np.str_('b')

        Negative indexing:

        >>> arr[-1]
        np.str_('d')

        Slice indexing returns a new ArkoudaCategorical:

        >>> result = arr[1:3]
        >>> type(result)
        <class 'arkouda.pandas.extension._arkouda_categorical_array.ArkoudaCategorical'>

        NumPy integer array indexing:

        >>> idx = np.array([0, 2], dtype=np.int64)
        >>> sliced = arr[idx]
        >>> isinstance(sliced, ArkoudaCategorical)
        True

        NumPy boolean mask:

        >>> mask = np.array([True, False, True, False])
        >>> masked = arr[mask]
        >>> isinstance(masked, ArkoudaCategorical)
        True

        Empty integer indexer returns an empty ArkoudaCategorical:

        >>> empty_idx = np.array([], dtype=np.int64)
        >>> empty = arr[empty_idx]
        >>> len(empty)
        0
        """
        from arkouda.numpy.pdarraycreation import array as ak_array
        from arkouda.pandas.categorical import Categorical

        # Empty list / tuple / ndarray indexer: build an empty result directly.
        if isinstance(key, (list, tuple, np.ndarray)) and len(key) == 0:
            return ArkoudaCategorical(Categorical(ak_array([], dtype="str_")))

        # Scalar integers: the underlying Categorical already returns a scalar.
        if isinstance(key, (int, np.integer)):
            return self._data[key]

        # Slices: the underlying Categorical returns a Categorical to re-wrap.
        if isinstance(key, slice):
            return ArkoudaCategorical(self._data[key])

        # NumPy array indexers: normalize to Arkouda pdarrays.
        if isinstance(key, np.ndarray):
            if key.dtype == bool:
                key = ak_array(key)
            elif np.issubdtype(key.dtype, np.signedinteger):
                key = ak_array(key, dtype="int64")
            elif np.issubdtype(key.dtype, np.unsignedinteger):
                key = ak_array(key, dtype="uint64")
            else:
                raise TypeError(f"Unsupported numpy index type {key.dtype}")
        elif not isinstance(key, (pdarray, Categorical)):
            # Generic indexers (e.g. Python lists of ints/bools).
            key = ak_array(key)

        # Delegate to the underlying arkouda.Categorical.
        result = self._data[key]

        # A length-1 indexer is normalized to a scalar by going through
        # categories[codes].
        if isinstance(key, pdarray) and key.size == 1:
            codes = result.codes if isinstance(result, Categorical) else result
            labels = self._data.categories[codes]
            # Only collapse when one element was actually selected: a length-1
            # boolean mask holding False selects nothing, and indexing the
            # empty labels would raise IndexError.
            if labels.size == 1:
                return labels[0]

        # Non-scalar: wrap the Categorical in ArkoudaCategorical.
        if isinstance(result, Categorical):
            return ArkoudaCategorical(result)

        # Fallback: something array-like but not Categorical came back, so
        # rebuild a Categorical from it.
        return ArkoudaCategorical(Categorical(result))

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build an ``ArkoudaCategorical`` from a sequence of labels.

        Parameters
        ----------
        scalars : Categorical | ArkoudaCategorical | sequence
            An Arkouda ``Categorical`` or an ``ArkoudaCategorical`` is passed to
            the constructor unchanged; anything else is converted via
            ``Categorical(array(scalars))``.
        dtype : Any
            Accepted for pandas interface compatibility; not used here.
        copy : bool
            Accepted for pandas interface compatibility; not used here.

        Returns
        -------
        ArkoudaCategorical
            A new instance of ``cls``.
        """
        from arkouda import Categorical, array

        # The constructor already knows how to reuse the backing object of an
        # existing ArkoudaCategorical, so do not push it through ``array``.
        if not isinstance(scalars, (Categorical, ArkoudaCategorical)):
            scalars = Categorical(array(scalars))
        return cls(scalars)

    @overload
    def astype(self, dtype: np.dtype[Any], copy: bool = True) -> NDArray[Any]: ...

    @overload
    def astype(self, dtype: ExtensionDtype, copy: bool = True) -> ExtensionArray: ...

    @overload
    def astype(self, dtype: Any, copy: bool = True) -> Union[ExtensionArray, NDArray[Any]]: ...

    def astype(
        self,
        dtype: Any,
        copy: bool = True,
    ) -> Union[ExtensionArray, NDArray[Any]]:
        """
        Cast to a specified dtype.

        * If ``dtype`` is categorical (pandas ``category`` / ``CategoricalDtype`` /
          ``ArkoudaCategoricalDtype``), returns an Arkouda-backed
          ``ArkoudaCategorical`` (optionally copied).
        * If ``dtype`` requests ``object``, returns a NumPy ``ndarray`` of dtype object
          containing the category labels (materialized to the client).
        * If ``dtype`` requests a string dtype, returns an Arkouda-backed
          ``ArkoudaStringArray`` containing the labels as strings.
        * Otherwise, casts the labels (as strings) to the requested dtype and returns an
          Arkouda-backed ExtensionArray.

        Parameters
        ----------
        dtype : Any
            Target dtype.
        copy : bool
            Whether to force a copy when possible. If categorical-to-categorical and
            ``copy=True``, attempts to copy the underlying Arkouda ``Categorical`` (if
            supported). Default is True.

        Returns
        -------
        Union[ExtensionArray, NDArray[Any]]
            The cast result. Returns a NumPy array only when casting to ``object``;
            otherwise returns an Arkouda-backed ExtensionArray.

        Examples
        --------
        Casting to ``category`` returns an Arkouda-backed categorical array:

        >>> import arkouda as ak
        >>> from arkouda.pandas.extension import ArkoudaCategorical
        >>> c = ArkoudaCategorical(ak.Categorical(ak.array(["x", "y", "x"])))
        >>> out = c.astype("category")
        >>> out is c
        False

        Forcing a copy when casting to the same categorical dtype returns a new array:

        >>> out2 = c.astype("category", copy=True)
        >>> out2 is c
        False
        >>> out2.to_ndarray()
        array(['x', 'y', 'x'], dtype='<U...')

        Casting to ``object`` materializes the category labels to a NumPy object array:

        >>> c.astype(object)
        array(['x', 'y', 'x'], dtype=object)

        Casting to a string dtype returns an Arkouda-backed string array of labels:

        >>> s = c.astype("string")
        >>> s.to_ndarray()
        array(['x', 'y', 'x'], dtype='<U1')

        Casting to another dtype casts the labels-as-strings and returns an Arkouda-backed array:

        >>> c_num = ArkoudaCategorical(ak.Categorical(ak.array(["1", "2", "3"])))
        >>> a = c_num.astype("int64")
        >>> a.to_ndarray()
        array([1, 2, 3])
        """
        # --- 1) ExtensionDtype branch first: proves overload #2 returns ExtensionArray ---
        if isinstance(dtype, ExtensionDtype):
            # NOTE(review): the dtype is unwrapped to its ``numpy_dtype`` *before*
            # the categorical/string checks, and the ``object`` branch below is
            # never consulted on this path -- confirm that is intended for
            # extension dtypes that expose ``numpy_dtype``.
            if hasattr(dtype, "numpy_dtype"):
                dtype = dtype.numpy_dtype
            return type_cast(ExtensionArray, self._astype_arkouda_backed(dtype, copy))

        # --- 2) object -> numpy ---
        if dtype in (object, np.object_, "object", np.dtype("O")):
            return self.to_ndarray().astype(object, copy=False)

        return self._astype_arkouda_backed(dtype, copy)

    def _astype_arkouda_backed(self, dtype: Any, copy: bool) -> ExtensionArray:
        """Cast to a categorical, string, or other dtype, keeping the result Arkouda-backed."""
        from arkouda.numpy._typing._typing import is_string_dtype_hint

        # Categorical -> categorical: return self, or a (possibly copied) re-wrap.
        if isinstance(dtype, (ArkoudaCategoricalDtype, pd_CategoricalDtype)) or dtype in ("category",):
            if not copy:
                return self
            # Copy the backing object only when it offers ``copy``.
            data = self._data.copy() if hasattr(self._data, "copy") else self._data
            return type(self)(data)

        # Every remaining target starts from the labels rendered as strings.
        data = self._data.to_strings()
        if isinstance(dtype, pd_StringDtype) or is_string_dtype_hint(dtype):
            return ArkoudaStringArray._from_sequence(data)

        casted = data.astype(dtype)
        return ArkoudaExtensionArray._from_sequence(casted)

    def isna(self) -> np.ndarray:
        """
        Return a boolean mask indicating missing values.

        This implements the pandas ExtensionArray.isna contract and returns a
        NumPy ndarray[bool] of the same length as this categorical array.

        Returns
        -------
        np.ndarray
            Boolean mask where True indicates a missing value.

        Raises
        ------
        TypeError
            If the backing object is not an Arkouda ``Categorical``, or if
            computing the mask from its codes fails.
        """
        from arkouda.pandas.categorical import Categorical

        data = self._data  # should be an arkouda.Categorical
        if not isinstance(data, Categorical):
            raise TypeError("ArkoudaCategorical.isna requires an arkouda.Categorical backend")

        # Missing values in ArkoudaCategorical are represented by code == -1
        try:
            return (data.codes == -1).to_ndarray()
        except Exception as e:
            raise TypeError(f"Unable to determine missing values: {e}") from e

    def isnull(self):
        """Alias for isna()."""
        return self.isna()

    @property
    def dtype(self):
        """Return a new ``ArkoudaCategoricalDtype`` instance describing this array."""
        return ArkoudaCategoricalDtype()

    def __eq__(self, other):
        """
        Elementwise equality for ArkoudaCategorical.

        Parameters
        ----------
        other : ArkoudaCategorical | pdarray | scalar | list | tuple | ndarray
            Right-hand operand.

        Returns
        -------
        ArkoudaArray
            Wrapped result of comparing the backing ``Categorical`` with
            ``other``; ``NotImplemented`` for unsupported operand types.

        Raises
        ------
        ValueError
            If ``other`` is array-like and its length is neither 1 (where
            allowed) nor ``len(self)``.
        """
        from arkouda.numpy.pdarraycreation import array as ak_array
        from arkouda.pandas.categorical import Categorical

        # Case 1: Categorical vs Categorical
        if isinstance(other, ArkoudaCategorical):
            if len(self) != len(other):
                raise ValueError("Lengths must match for elementwise comparison")
            return ArkoudaArray(self._data == other._data)

        # Case 2: Categorical vs arkouda pdarray (e.g., codes or labels, depending on ak semantics)
        if isinstance(other, pdarray):
            if other.size not in (1, len(self)):
                raise ValueError("Lengths must match for elementwise comparison")
            return ArkoudaArray(self._data == other)

        # Case 3: scalar (string / category label / code)
        if np.isscalar(other):
            return ArkoudaArray(self._data == other)

        # Case 4: numpy array or Python sequence
        if isinstance(other, (list, tuple, np.ndarray)):
            other_ak = Categorical(ak_array(other))
            # A single-element sequence is compared as its one element.
            if other_ak.size == 1:
                return ArkoudaArray(self._data == other_ak[0])
            if other_ak.size != len(self):
                raise ValueError("Lengths must match for elementwise comparison")
            return ArkoudaArray(self._data == other_ak)

        # Case 5: unsupported type
        return NotImplemented

    def __repr__(self):
        """Return ``ArkoudaCategorical(<backing data>)``."""
        return f"ArkoudaCategorical({self._data})"

    def value_counts(self, dropna: bool = True) -> pd.Series:
        """
        Return counts of categories as a pandas Series.

        This method computes category frequencies from the underlying Arkouda
        ``Categorical`` and returns them as a pandas ``Series``, where the
        index contains the category labels and the values contain the
        corresponding counts.

        Parameters
        ----------
        dropna : bool
            Whether to drop missing values from the result. When truthy,
            the result is filtered using the categorical's ``na_value``.
            When falsy, all categories returned by the underlying
            computation are included. Default is True.

        Returns
        -------
        pd.Series
            A Series containing category counts.
            The index is an ``ArkoudaStringArray`` of category labels and the
            values are an ``ArkoudaArray`` of counts. An empty ``int64`` Series
            is returned when there is nothing to count.

        Notes
        -----
        - The result is computed server-side in Arkouda; only the (typically small)
          output of categories and counts is materialized for the pandas ``Series``.
        - This method does not yet support pandas options such as ``normalize``,
          ``sort``, or ``bins``.
        - The handling of missing values depends on the Arkouda ``Categorical``
          definition of ``na_value``.

        Examples
        --------
        >>> import arkouda as ak
        >>> from arkouda.pandas.extension import ArkoudaCategorical
        >>>
        >>> a = ArkoudaCategorical(["a", "b", "a", "c", "b", "a"])
        >>> a.value_counts()
        a    3
        b    2
        c    1
        dtype: int64
        """
        from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray
        from arkouda.pandas.groupbyclass import GroupBy

        cat = self._data
        codes = cat.codes
        if codes.size == 0:
            return pd.Series(dtype="int64")

        # Count occurrences per code, then map the codes back to labels.
        grouped_codes, counts = GroupBy(codes).size()
        categories = cat.categories[grouped_codes]

        # Truthiness rather than ``is True`` so that truthy non-``bool`` flags
        # (e.g. ``numpy.bool_``) are honoured instead of silently ignored.
        if dropna:
            mask = categories != cat.na_value
            categories = categories[mask]
            counts = counts[mask]

        if categories.size == 0:
            return pd.Series(dtype="int64")

        return pd.Series(
            ArkoudaArray._from_sequence(counts),
            index=ArkoudaStringArray._from_sequence(categories),
        )

    # ------------------------------------------------------------------
    # pandas.Categorical-specific API that is not yet implemented
    # ------------------------------------------------------------------
    def _categorical_not_implemented(self, name: str):
        """Raise ``NotImplementedError`` naming the unimplemented member ``name``."""
        raise NotImplementedError(f"{name} is not yet implemented for ArkoudaCategorical.")

    # Private pandas.Categorical hooks: each one raises NotImplementedError.
    def _categories_match_up_to_permutation(self, *args, **kwargs):
        self._categorical_not_implemented("_categories_match_up_to_permutation")

    def _constructor(self, *args, **kwargs):
        self._categorical_not_implemented("_constructor")

    def _dir_additions(self, *args, **kwargs):
        self._categorical_not_implemented("_dir_additions")

    def _dir_deletions(self, *args, **kwargs):
        self._categorical_not_implemented("_dir_deletions")

    def _encode_with_my_categories(self, *args, **kwargs):
        self._categorical_not_implemented("_encode_with_my_categories")

    def _from_inferred_categories(self, *args, **kwargs):
        self._categorical_not_implemented("_from_inferred_categories")

    def _get_values_repr(self, *args, **kwargs):
        self._categorical_not_implemented("_get_values_repr")

    def _internal_get_values(self, *args, **kwargs):
        self._categorical_not_implemented("_internal_get_values")

    def _replace(self, *args, **kwargs):
        self._categorical_not_implemented("_replace")

    def _repr_categories(self, *args, **kwargs):
        self._categorical_not_implemented("_repr_categories")

    def _reset_cache(self, *args, **kwargs):
        self._categorical_not_implemented("_reset_cache")

    def _reverse_indexer(self, *args, **kwargs):
        self._categorical_not_implemented("_reverse_indexer")

    def _set_categories(self, *args, **kwargs):
        self._categorical_not_implemented("_set_categories")

    def _set_dtype(self, *args, **kwargs):
        self._categorical_not_implemented("_set_dtype")

    def _unbox_scalar(self, *args, **kwargs):
        self._categorical_not_implemented("_unbox_scalar")

    def _validate_codes_for_dtype(self, *args, **kwargs):
        self._categorical_not_implemented("_validate_codes_for_dtype")

    def _validate_listlike(self, *args, **kwargs):
        self._categorical_not_implemented("_validate_listlike")

    def _values_for_rank(self, *args, **kwargs):
        self._categorical_not_implemented("_values_for_rank")

    # Public pandas.Categorical API: each one raises NotImplementedError.
    def add_categories(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("add_categories")

    def as_ordered(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("as_ordered")

    def as_unordered(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("as_unordered")

    def check_for_ordered(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("check_for_ordered")

    def describe(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("describe")

    @classmethod
    def from_codes(cls, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        raise NotImplementedError("from_codes is not yet implemented for ArkoudaCategorical.")

    def memory_usage(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("memory_usage")

    def notna(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("notna")

    def notnull(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("notnull")

    def remove_categories(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("remove_categories")

    def remove_unused_categories(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("remove_unused_categories")

    def rename_categories(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("rename_categories")

    def reorder_categories(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("reorder_categories")

    def set_categories(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("set_categories")

    def set_ordered(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("set_ordered")

    def sort_values(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("sort_values")

    def to_list(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("to_list")

    def max(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("max")

    def min(self, *args, **kwargs):
        """Not yet implemented; raises ``NotImplementedError``."""
        self._categorical_not_implemented("min")