from __future__ import annotations
from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, overload
from typing import cast as type_cast
import numpy as np
import pandas as pd
from numpy import ndarray
from numpy.typing import NDArray
from pandas import StringDtype as pd_StringDtype
from pandas.api.extensions import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype
from arkouda.numpy.dtypes import str_
from arkouda.pandas.extension import ArkoudaArray
from ._arkouda_extension_array import ArkoudaExtensionArray
from ._dtypes import ArkoudaStringDtype
if TYPE_CHECKING:
from arkouda.numpy.strings import Strings
else:
Strings = TypeVar("Strings")
__all__ = ["ArkoudaStringArray"]
[docs]
class ArkoudaStringArray(ArkoudaExtensionArray, ExtensionArray):
"""
Arkouda-backed string pandas ExtensionArray.
Ensures the underlying data is an Arkouda ``Strings`` object. Accepts existing
``Strings`` or converts from NumPy arrays and Python sequences of strings.
Parameters
----------
data : Strings | ndarray | Sequence[Any] | ArkoudaStringArray
Input to wrap or convert.
- If ``Strings``, used directly.
- If NumPy/sequence, converted via ``ak.array``.
- If another ``ArkoudaStringArray``, its backing ``Strings`` is reused.
Raises
------
TypeError
If ``data`` cannot be converted to Arkouda ``Strings``.
Attributes
----------
default_fill_value : str
Sentinel used when filling missing values (default: "").
"""
default_fill_value: str = ""
def __init__(self, data: Strings | ndarray | Sequence[Any] | "ArkoudaStringArray"):
from arkouda.numpy.pdarraycreation import array as ak_array
from arkouda.numpy.strings import Strings
if isinstance(data, ArkoudaStringArray):
self._data = data._data
return
if isinstance(data, (np.ndarray, list, tuple)):
data = type_cast(Strings, ak_array(data, dtype="str_"))
if not isinstance(data, Strings):
raise TypeError(f"Expected arkouda.Strings, got {type(data).__name__}")
self._data = data
@property
def dtype(self):
return ArkoudaStringDtype()
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
from arkouda.numpy.pdarraycreation import array as ak_array
return cls(ak_array(scalars))
def __getitem__(self, key: Any) -> Any:
"""
Retrieve one or more string values.
Parameters
----------
key : Any
Positional indexer. Supports:
* scalar integer positions
* slice objects
* NumPy integer arrays (signed/unsigned)
* NumPy boolean masks
* Python lists of integers / booleans
* Arkouda pdarray indexers (int / uint / bool)
Returns
-------
Any
A Python string for scalar access, or a new ArkoudaStringArray
for non-scalar indexers.
Raises
------
TypeError
If ``key`` is a NumPy array with an unsupported dtype (for example,
a floating point or object dtype).
Examples
--------
Basic scalar access:
>>> import arkouda as ak
>>> from arkouda.pandas.extension import ArkoudaStringArray
>>> arr = ArkoudaStringArray(ak.array(["a", "b", "c", "d"]))
>>> arr[1]
np.str_('b')
Negative indexing:
>>> arr[-1]
np.str_('d')
Slice indexing (returns a new ArkoudaStringArray):
>>> arr[1:3]
ArkoudaStringArray(['b', 'c'])
NumPy integer array indexing:
>>> idx = np.array([0, 2], dtype=np.int64)
>>> arr[idx]
ArkoudaStringArray(['a', 'c'])
NumPy boolean mask:
>>> mask = np.array([True, False, True, False])
>>> arr[mask]
ArkoudaStringArray(['a', 'c'])
Arkouda integer indexer:
>>> ak_idx = ak.array([3, 1])
>>> arr[ak_idx]
ArkoudaStringArray(['d', 'b'])
Empty indexer returns an empty ArkoudaStringArray:
>>> empty_idx = np.array([], dtype=np.int64)
>>> arr[empty_idx]
ArkoudaStringArray([])
"""
from arkouda.numpy.pdarraycreation import array as ak_array
from arkouda.numpy.strings import Strings
# Normalize NumPy indexers to Arkouda pdarrays, mirroring ArkoudaArray.__getitem__
if isinstance(key, np.ndarray):
if key.dtype == bool:
key = ak_array(key)
elif key.dtype.kind in {"i"}:
# signed integer
key = ak_array(key, dtype="int64")
elif key.dtype.kind in {"u"}:
# unsigned integer
key = ak_array(key, dtype="uint64")
else:
raise TypeError(f"Unsupported numpy index type {key.dtype}")
result = self._data[key]
# Scalar access: return a plain Python str (or scalar) instead of a Strings object
if np.isscalar(key):
return result
# Non-scalar: expect an Arkouda Strings, wrap it
if isinstance(result, Strings):
return ArkoudaStringArray(result)
# Fallback: if Arkouda returned something array-like but not Strings,
# materialize via ak.array and wrap again as Strings.
return ArkoudaStringArray(ak_array(result))
# docstr-coverage:excused `typing-only overload stub`
@overload
def astype(self, dtype: np.dtype[Any], copy: bool = True) -> NDArray[Any]: ...
# docstr-coverage:excused `typing-only overload stub`
@overload
def astype(self, dtype: ExtensionDtype, copy: bool = True) -> ExtensionArray: ...
# docstr-coverage:excused `typing-only overload stub`
@overload
def astype(self, dtype: Any, copy: bool = True) -> Union[ExtensionArray, NDArray[Any]]: ...
[docs]
def astype(
self,
dtype: Any,
copy: bool = True,
) -> Union[ExtensionArray, NDArray[Any]]:
"""
Cast to a specified dtype.
Casting rules:
* If ``dtype`` requests ``object``, returns a NumPy ``NDArray[Any]`` of dtype
``object`` containing the string values.
* If ``dtype`` is a string dtype (e.g. pandas ``StringDtype``, NumPy unicode,
or Arkouda string dtype), returns an ``ArkoudaStringArray``. If ``copy=True``,
attempts to copy the underlying Arkouda ``Strings`` data.
* For all other dtypes, casts the underlying Arkouda ``Strings`` using
``Strings.astype`` and returns an Arkouda-backed ``ArkoudaExtensionArray``
constructed from the result.
Parameters
----------
dtype : Any
Target dtype. May be a NumPy dtype, pandas dtype, or Arkouda dtype.
copy : bool
Whether to force a copy when the result is an ``ArkoudaStringArray``.
Default is True.
Returns
-------
Union[ExtensionArray, NDArray[Any]]
The cast result. Returns a NumPy array only when casting to ``object``;
otherwise returns an Arkouda-backed ExtensionArray.
Examples
--------
Casting to a string dtype returns an Arkouda-backed string array:
>>> import arkouda as ak
>>> from arkouda.pandas.extension import ArkoudaStringArray
>>> s = ArkoudaStringArray(ak.array(["a", "b", "c"]))
>>> out = s.astype("string")
>>> out is s
False
Forcing a copy when casting to a string dtype returns a new array:
>>> out2 = s.astype("string", copy=True)
>>> out2 is s
False
>>> out2.to_ndarray()
array(['a', 'b', 'c'], dtype='<U1')
Casting to ``object`` materializes the data to a NumPy array:
>>> s.astype(object)
array(['a', 'b', 'c'], dtype=object)
Casting to a non-string dtype uses Arkouda to cast the underlying strings
and returns an Arkouda-backed ExtensionArray:
>>> s_num = ArkoudaStringArray(ak.array(["1", "2", "3"]))
>>> a = s_num.astype("int64")
>>> a.to_ndarray()
array([1, 2, 3])
NumPy and pandas dtype objects are also accepted:
>>> import numpy as np
>>> a = s_num.astype(np.dtype("float64"))
>>> a.to_ndarray()
array([1., 2., 3.])
"""
from arkouda.numpy._typing._typing import is_string_dtype_hint
from arkouda.numpy.dtypes import dtype as ak_dtype
# --- 1) ExtensionDtype branch first (satisfies overload #2) ---
if isinstance(dtype, ExtensionDtype):
if hasattr(dtype, "numpy_dtype"):
dtype = dtype.numpy_dtype
if isinstance(dtype, pd_StringDtype) or is_string_dtype_hint(dtype):
if not copy:
return self
data = self._data.copy() if hasattr(self._data, "copy") else self._data
return type_cast(ExtensionArray, type(self)(data))
dtype = ak_dtype(dtype)
casted = self._data.astype(dtype)
return type_cast(ExtensionArray, ArkoudaExtensionArray._from_sequence(casted))
# --- 2) object -> numpy (satisfies overload #1 / general) ---
if dtype in (object, np.object_, "object", np.dtype("O")):
return self.to_ndarray().astype(object, copy=False)
# string targets -> stay string EA
if isinstance(dtype, pd_StringDtype) or is_string_dtype_hint(dtype):
if not copy:
return self
data = self._data.copy() if hasattr(self._data, "copy") else self._data
return type(self)(data)
dtype = ak_dtype(dtype)
casted = self._data.astype(dtype)
return ArkoudaExtensionArray._from_sequence(casted)
[docs]
def isna(self):
from arkouda.numpy.pdarraycreation import zeros
return zeros(self._data.size, dtype="bool")
def __eq__(self, other):
"""
Elementwise equality for string arrays using pandas ExtensionArray semantics.
Returns ArkoudaArray of booleans.
"""
from arkouda.numpy.pdarrayclass import pdarray
from arkouda.numpy.pdarraycreation import array as ak_array
# Case 1: ArkoudaStringArray
if isinstance(other, ArkoudaStringArray):
if len(self) != len(other):
raise ValueError("Lengths must match for elementwise comparison")
return ArkoudaArray(self._data == other._data)
# Case 2: arkouda pdarray (should contain encoded string indices)
if isinstance(other, pdarray):
if other.size not in (1, len(self)):
raise ValueError("Lengths must match for elementwise comparison")
return ArkoudaArray(self._data == other)
# Case 3: scalar (string or bytes)
if isinstance(other, (str, str_)):
return ArkoudaArray(self._data == other)
# Case 4: numpy array or Python sequence
if isinstance(other, (list, tuple, np.ndarray)):
other_ak = ak_array(other)
if other_ak.size == 1:
return ArkoudaArray(self._data == other_ak[0])
if other_ak.size != len(self):
raise ValueError("Lengths must match for elementwise comparison")
return ArkoudaArray(self._data == other_ak)
# Case 5: unsupported type
return NotImplemented
def __repr__(self):
return f"ArkoudaStringArray({self._data})"
[docs]
def value_counts(self, dropna: bool = True) -> pd.Series:
"""
Return counts of unique strings as a pandas Series.
This method computes the frequency of each distinct string value in the
underlying Arkouda ``Strings`` object and returns the result as a pandas
``Series``, with the unique string values as the index and their counts
as the data.
Parameters
----------
dropna : bool
Whether to exclude missing values. Missing-value handling for
Arkouda string arrays is not yet implemented, so this parameter is
accepted for pandas compatibility but currently has no effect.
Default is True.
Returns
-------
pd.Series
A Series containing the counts of unique string values.
The index is an ``ArkoudaStringArray`` of unique values, and the
values are an ``ArkoudaArray`` of counts.
Notes
-----
- The following pandas options are not yet implemented:
``normalize``, ``sort``, and ``bins``.
- Counting is performed server-side in Arkouda; only the small result
(unique values and counts) is materialized on the client.
Examples
--------
Basic usage:
>>> import arkouda as ak
>>> from arkouda.pandas.extension import ArkoudaStringArray
>>>
>>> s = ArkoudaStringArray(["red", "blue", "red", "green", "blue", "red"])
>>> s.value_counts()
red 3
blue 2
green 1
dtype: int64
Empty input:
>>> empty = ArkoudaStringArray([])
>>> empty.value_counts()
Series([], dtype: int64)
"""
import pandas as pd
from arkouda.numpy.strings import Strings
from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray
from arkouda.pandas.groupbyclass import GroupBy
s = self._data
if s.size == 0:
return pd.Series(dtype="int64")
values, counts = GroupBy(s).size()
# For type checking:
assert isinstance(values, Strings)
if values.size == 0:
return pd.Series(dtype="int64")
return pd.Series(
ArkoudaArray._from_sequence(counts),
index=ArkoudaStringArray._from_sequence(values),
)
def _not_implemented(self, name: str):
raise NotImplementedError(f"`{name}` is not implemented for Arkouda-backed arrays yet.")
[docs]
def all(self, *args, **kwargs):
self._not_implemented("all")
[docs]
def any(self, *args, **kwargs):
self._not_implemented("any")
[docs]
def argpartition(self, *args, **kwargs):
self._not_implemented("argpartition")
[docs]
def byteswap(self, *args, **kwargs):
self._not_implemented("byteswap")
[docs]
def choose(self, *args, **kwargs):
self._not_implemented("choose")
[docs]
def clip(self, *args, **kwargs):
self._not_implemented("clip")
[docs]
def compress(self, *args, **kwargs):
self._not_implemented("compress")
[docs]
def conj(self, *args, **kwargs):
self._not_implemented("conj")
[docs]
def conjugate(self, *args, **kwargs):
self._not_implemented("conjugate")
[docs]
def cumprod(self, *args, **kwargs):
self._not_implemented("cumprod")
[docs]
def cumsum(self, *args, **kwargs):
self._not_implemented("cumsum")
[docs]
def diagonal(self, *args, **kwargs):
self._not_implemented("diagonal")
[docs]
def dot(self, *args, **kwargs):
self._not_implemented("dot")
[docs]
def dump(self, *args, **kwargs):
self._not_implemented("dump")
[docs]
def dumps(self, *args, **kwargs):
self._not_implemented("dumps")
[docs]
def fill(self, *args, **kwargs):
self._not_implemented("fill")
[docs]
def flatten(self, *args, **kwargs):
self._not_implemented("flatten")
[docs]
def getfield(self, *args, **kwargs):
self._not_implemented("getfield")
[docs]
def item(self, *args, **kwargs):
self._not_implemented("item")
[docs]
def max(self, *args, **kwargs):
self._not_implemented("max")
[docs]
def mean(self, *args, **kwargs):
self._not_implemented("mean")
[docs]
def min(self, *args, **kwargs):
self._not_implemented("min")
[docs]
def nonzero(self, *args, **kwargs):
self._not_implemented("nonzero")
[docs]
def partition(self, *args, **kwargs):
self._not_implemented("partition")
[docs]
def prod(self, *args, **kwargs):
self._not_implemented("prod")
[docs]
def put(self, *args, **kwargs):
self._not_implemented("put")
[docs]
def resize(self, *args, **kwargs):
self._not_implemented("resize")
[docs]
def round(self, *args, **kwargs):
self._not_implemented("round")
[docs]
def setfield(self, *args, **kwargs):
self._not_implemented("setfield")
[docs]
def setflags(self, *args, **kwargs):
self._not_implemented("setflags")
[docs]
def sort(self, *args, **kwargs):
self._not_implemented("sort")
[docs]
def std(self, *args, **kwargs):
self._not_implemented("std")
[docs]
def sum(self, *args, **kwargs):
self._not_implemented("sum")
[docs]
def swapaxes(self, *args, **kwargs):
self._not_implemented("swapaxes")
[docs]
def to_device(self, *args, **kwargs):
self._not_implemented("to_device")
[docs]
def tobytes(self, *args, **kwargs):
self._not_implemented("tobytes")
[docs]
def tofile(self, *args, **kwargs):
self._not_implemented("tofile")
[docs]
def trace(self, *args, **kwargs):
self._not_implemented("trace")
[docs]
def var(self, *args, **kwargs):
self._not_implemented("var")