# Source code for arkouda.pandas.index

"""
Index and MultiIndex classes for Arkouda Series and DataFrames.

This module defines the foundational indexing structures used in Arkouda's
pandas-like API, supporting labeled indexing, alignment, and grouping operations.
Indexes provide the mechanism to assign meaningful labels to rows and columns.

Classes
-------
Index : class
    One-dimensional immutable sequence used to label and align axis data.
    Accepts various types of inputs including `pdarray`, `Strings`, `Categorical`,
    Python lists, or pandas Index/Categorical objects. Supports optional name and
    lightweight list-based indexing for small inputs.

MultiIndex : class
    A multi-level index for complex datasets, composed of multiple Index-like arrays
    ("levels"). Each level may contain categorical, string, or numeric values.
    Supports construction from a list of arrays or a `pandas.MultiIndex`.

Features
--------
- Flexible input types for index construction
- Support for named and multi-level indexing
- Efficient size and shape inference
- Alignment and equality comparison logic
- Integration with Arkouda Series and DataFrames

Notes
-----
- `MultiIndex` currently does **not** support construction from tuples; it must be
  created from lists of values or pandas MultiIndex objects.
- Only one-dimensional (1D) indexing is supported at this time.
- All level arrays in a `MultiIndex` must have the same length.

Examples
--------
>>> import arkouda as ak
>>> from arkouda.pandas.index import Index, MultiIndex

>>> idx = Index([10, 20, 30], name="id")
>>> idx
Index(array([10 20 30]), dtype='int64')

>>> midx = MultiIndex([ak.array([1, 2]), ak.array(["a", "b"])], names=["num", "char"])
>>> midx.nlevels
2
>>> midx.get_level_values("char")
Index(array(['a', 'b']), dtype='<U0')

See Also
--------
- arkouda.pandas.series.Series
- arkouda.pandas.categorical.Categorical

"""

from __future__ import annotations

import builtins
import json

from typing import (
    TYPE_CHECKING,
    Any,
    Hashable,
    Iterable,
    List,
    Literal,
    Optional,
    Tuple,
    TypeVar,
    Union,
)
from typing import cast as type_cast

import numpy as np
import pandas as pd

from numpy import array as ndarray
from numpy import dtype as npdtype
from typeguard import typechecked

from arkouda.numpy.dtypes import bool_ as akbool
from arkouda.numpy.dtypes import bool_scalars
from arkouda.numpy.manipulation_functions import flip as ak_flip
from arkouda.numpy.pdarrayclass import RegistrationError, pdarray
from arkouda.numpy.pdarraysetops import argsort, in1d
from arkouda.numpy.sorting import coargsort
from arkouda.numpy.util import convert_if_categorical, generic_concat, get_callback
from arkouda.pandas.groupbyclass import GroupBy, groupable, unique


__all__ = [
    "Index",
    "MultiIndex",
]

if TYPE_CHECKING:
    from arkouda.numpy.pdarraycreation import ones
    from arkouda.numpy.strings import Strings
    from arkouda.pandas.categorical import Categorical
    from arkouda.pandas.series import Series
else:
    # At runtime these names are placeholder TypeVars so that annotations in
    # this module resolve without importing the heavy arkouda modules
    # (avoids circular imports at import time).
    Strings = TypeVar("Strings")
    Categorical = TypeVar("Categorical")
    Series = TypeVar("Series")


class Index:
    """
    Sequence used for indexing and alignment.

    The basic object storing axis labels for all DataFrame objects.

    Parameters
    ----------
    values : List, pdarray, Strings, Categorical, pandas.Categorical, pandas.Index, or Index
    name : str, default=None
        Name to be stored in the index.
    allow_list : bool, default=False
        If False, list values will be converted to a pdarray.
        If True, list values will remain as a list, provided the data length
        is less than max_list_size.
    max_list_size : int, default=1000
        This is the maximum allowed data length for the values to be stored
        as a list object.

    Raises
    ------
    ValueError
        Raised if allow_list=True and the size of values is > max_list_size.

    See Also
    --------
    MultiIndex

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.Index([1, 2, 3])
    Index(array([1 2 3]), dtype='int64')

    >>> ak.Index(list('abc'))
    Index(array(['a', 'b', 'c']), dtype='<U0')

    >>> ak.Index([1, 2, 3], allow_list=True)
    Index([1, 2, 3], dtype='int64')
    """

    objType = "Index"

    def _set_dtype(self):
        """
        Infer and set the dtype of the Index based on its values.

        This method examines the type of `self.values` and assigns an appropriate
        dtype to `self.dtype`. If the type is not recognized, `self.dtype` is set
        to None.
        """
        from arkouda.numpy.dtypes import dtype as ak_dtype
        from arkouda.numpy.pdarrayclass import pdarray
        from arkouda.numpy.strings import Strings
        from arkouda.pandas.categorical import Categorical

        if isinstance(self.values, List):
            # FIX: previously this read ``self[0].dtype``, which raises
            # AttributeError when the list holds plain Python scalars (e.g.
            # ``int``) that have no ``.dtype`` attribute. Use the same
            # inference helper ``__init__`` uses for list-backed values,
            # mirroring its empty-list handling (dtype=None).
            if len(self.values) > 0:
                self.dtype = self._dtype_of_list_values(self.values)
            else:
                self.dtype = None
        elif isinstance(self.values, Strings):
            self.dtype = ak_dtype(str)
        elif isinstance(self.values, (pdarray, Categorical, pd.Index)):
            self.dtype = self.values.dtype
        else:
            self.dtype = None

    @typechecked
    def __init__(
        self,
        values: Union[List, pdarray, Strings, Categorical, pd.Index, "Index", pd.Categorical],
        name: Optional[str] = None,
        allow_list=False,
        max_list_size=1000,
    ):
        from arkouda.numpy.pdarrayclass import pdarray
        from arkouda.numpy.pdarraycreation import array
        from arkouda.numpy.strings import Strings
        from arkouda.pandas.categorical import Categorical

        self.max_list_size = max_list_size
        # Name this Index is registered under on the server, if any.
        self.registered_name: Optional[str] = None

        # Normalize pandas Categorical input to the arkouda Categorical type.
        if isinstance(values, pd.Categorical):
            values = Categorical(values)

        if isinstance(values, Index):
            # Copy-construct from an existing Index; keep its name unless the
            # caller supplied one.
            self.values = values.values
            self.size = values.size
            self._set_dtype()
            self.name = name if name else values.name
        elif isinstance(values, pd.Index):
            if isinstance(values.values, pd.Categorical):
                self.values = Categorical(values.values)
            else:
                self.values = array(values.values)
            self.size = values.size
            self._set_dtype()
            self.name = name if name else values.name
        elif isinstance(values, List):
            if allow_list is True:
                if len(values) <= max_list_size:
                    # Keep the values client-side as a plain Python list.
                    self.values = values
                    self.size = len(values)
                    if len(values) > 0:
                        self.dtype = self._dtype_of_list_values(values)
                    else:
                        self.dtype = None
                else:
                    raise ValueError(
                        f"Cannot create Index because list size {len(values)} "
                        f"exceeds max_list_size {self.max_list_size}."
                    )
            else:
                # Promote the list to a server-side pdarray.
                values = array(values)
                self.values = values
                self.size = self.values.size
                self._set_dtype()
            self.name = name
        elif isinstance(values, (pdarray, Strings, Categorical)):
            self.values = values
            self.size = self.values.size
            self._set_dtype()
            self.name = name
        else:
            raise TypeError(f"Unable to create Index from type {type(values)}")

    def __getitem__(self, key):
        """
        Retrieve item(s) from the Index.

        Parameters
        ----------
        key : int, list, slice, or Series
            The location(s) of the element(s) to retrieve.

        Returns
        -------
        Index or scalar
            Subset of the Index or a single value, depending on the key.
        """
        from arkouda.pandas.series import Series

        # Preserve list-backed storage in any Index produced from this one.
        allow_list = False
        if isinstance(self.values, list):
            allow_list = True

        if isinstance(key, Series):
            key = key.values

        if isinstance(key, int):
            # Scalar lookup returns the element, not an Index.
            return self.values[key]

        if isinstance(key, list):
            if len(key) < self.max_list_size:
                return Index([self.values[k] for k in key], allow_list=allow_list)
            else:
                raise ValueError(
                    f"Unable to get list of size greater than "
                    f"Index.max_list_size ({self.max_list_size})."
                )

        # pdarray / slice keys are delegated to the underlying container.
        return Index(self.values[key], allow_list=allow_list)

    def __repr__(self):
        """
        Return a string representation of the Index.

        Returns
        -------
        str
            Printable representation of the Index object.
        """
        return f"Index({repr(self.values)}, dtype='{self.dtype}')"

    def __len__(self):
        """
        Return the number of elements in the Index.

        Returns
        -------
        int
            Number of elements in the Index.
        """
        return len(self.values)

    def _get_arrays_for_comparison(
        self, other
    ) -> Tuple[Union[pdarray, Strings, Categorical], Union[pdarray, Strings, Categorical]]:
        """
        Coerce self and other into comparable array forms.

        List-backed values (on either side) are converted to pdarrays; an
        Index argument is unwrapped to its underlying values.
        """
        from arkouda.numpy.pdarraycreation import array

        if isinstance(self.values, list):
            values = array(self.values)
        else:
            values = self.values

        if isinstance(other, Index):
            other_values = other.values
        else:
            other_values = other

        if isinstance(other_values, list):
            other_values = array(other_values)
        return values, other_values

    def __eq__(self, other):
        """
        Compare Index with another Index or array-like object for equality.

        Parameters
        ----------
        other : Index or array-like
            The object to compare against.

        Returns
        -------
        pdarray or bool
            Boolean array indicating element-wise equality.
        """
        values, other_values = self._get_arrays_for_comparison(other)
        return values == other_values

    def __ne__(self, other):
        """
        Compare Index with another Index or array-like object for inequality.

        Parameters
        ----------
        other : Index or array-like
            The object to compare against.

        Returns
        -------
        pdarray or bool
            Boolean array indicating element-wise inequality.
        """
        values, other_values = self._get_arrays_for_comparison(other)
        return values != other_values

    def _dtype_of_list_values(self, lst):
        """
        Infer the Arkouda dtype of a list of values, ensuring all items share
        the same type.

        Parameters
        ----------
        lst : list
            List of values whose types are to be checked.

        Returns
        -------
        dtype
            Arkouda dtype corresponding to the list elements.

        Raises
        ------
        TypeError
            If input is not a list or contains mixed types.
        """
        if not isinstance(lst, list):
            raise TypeError("Expected a list of values.")

        from arkouda.numpy.dtypes import dtype as akdtype

        first_type = akdtype(type(lst[0]))
        for item in lst:
            item_type = akdtype(type(item))
            if item_type != first_type:
                raise TypeError(
                    f"Values of Index must all be the same type. Found {first_type} and {item_type}."
                )
        return first_type

    @property
    def nlevels(self):
        """
        Integer number of levels in this Index.

        An Index will always have 1 level.

        See Also
        --------
        MultiIndex.nlevels
        """
        return 1

    @property
    def ndim(self):
        """
        Number of dimensions of the underlying data, by definition 1.

        See Also
        --------
        MultiIndex.ndim
        """
        return 1

    @property
    def inferred_type(self) -> str:
        """Return a string of the type inferred from the values."""
        if isinstance(self.values, list):
            from arkouda.numpy.dtypes import float_scalars, int_scalars
            from arkouda.numpy.util import _is_dtype_in_union

            if _is_dtype_in_union(self.dtype, int_scalars):
                return "integer"
            elif _is_dtype_in_union(self.dtype, float_scalars):
                return "floating"
            elif str(self.dtype).startswith("<U"):
                return "string"
        # Non-list values (pdarray/Strings/Categorical) expose their own
        # inferred_type.
        return self.values.inferred_type

    @property
    def names(self):
        """Return Index or MultiIndex names."""
        return [self.name]

    @property
    def shape(self):
        """
        Return the shape of the Index.

        Returns
        -------
        tuple
            A tuple representing the shape of the Index (size,).
        """
        return (self.size,)

    @property
    def is_unique(self):
        """
        Property indicating if all values in the index are unique.

        Returns
        -------
        bool
            True if all values are unique, False otherwise.
        """
        if isinstance(self.values, list):
            return len(set(self.values)) == self.size
        else:
            # Group the values server-side; every group of size 1 means no
            # duplicates.
            g = GroupBy(self.values)
            key, ct = g.size()
            return (ct == 1).all()
[docs] @staticmethod def factory(index): """ Construct an Index or MultiIndex based on the input. Parameters ---------- index : array-like or tuple of array-like If a single array-like, returns an Index. If a tuple of array-like objects, returns a MultiIndex. Returns ------- Index or MultiIndex An Index if input is a single array-like, or a MultiIndex otherwise. """ if isinstance(index, Index): return index elif not isinstance(index, List) and not isinstance(index, Tuple): return Index(index) else: return MultiIndex(index)
    @classmethod
    def from_return_msg(cls, rep_msg):
        """
        Reconstruct an Index or MultiIndex from a return message.

        Parameters
        ----------
        rep_msg : str
            A string return message containing encoded index information.

        Returns
        -------
        Index or MultiIndex
            The reconstructed Index or MultiIndex instance.
        """
        from arkouda.numpy.pdarrayclass import create_pdarray, pdarray
        from arkouda.numpy.strings import Strings
        from arkouda.pandas.categorical import Categorical

        # rep_msg is a JSON list; each entry is "<objType>+|+<payload>".
        data = json.loads(rep_msg)

        idx = []
        for d in data:
            i_comps = d.split("+|+")
            # Dispatch on the server object type to the matching reconstructor.
            if i_comps[0].lower() == pdarray.objType.lower():
                idx.append(create_pdarray(i_comps[1]))
            elif i_comps[0].lower() == Strings.objType.lower():
                idx.append(Strings.from_return_msg(i_comps[1]))
            elif i_comps[0].lower() == Categorical.objType.lower():
                idx.append(Categorical.from_return_msg(i_comps[1]))

        # Multiple components -> MultiIndex; a single component -> Index.
        return cls.factory(idx) if len(idx) > 1 else cls.factory(idx[0])
    def equals(self, other: Index) -> bool_scalars:
        """
        Whether Indexes are the same size, and all entries are equal.

        Parameters
        ----------
        other : Index
            object to compare.

        Returns
        -------
        bool_scalars
            True if the Indexes are the same, o.w. False.

        Examples
        --------
        >>> import arkouda as ak
        >>> i = ak.Index([1, 2, 3])
        >>> i_cpy = ak.Index([1, 2, 3])
        >>> i.equals(i_cpy)
        np.True_
        >>> i2 = ak.Index([1, 2, 4])
        >>> i.equals(i2)
        np.False_

        MultiIndex case:

        >>> arrays = [ak.array([1, 1, 2, 2]), ak.array(["red", "blue", "red", "blue"])]
        >>> m = ak.MultiIndex(arrays, names=["numbers2", "colors2"])
        >>> m.equals(m)
        True

        >>> arrays2 = [ak.array([1, 1, 2, 2]), ak.array(["red", "blue", "red", "green"])]
        >>> m2 = ak.MultiIndex(arrays2, names=["numbers2", "colors2"])
        >>> m.equals(m2)
        False
        """
        # Identity short-circuit.
        if self is other:
            return True

        if not isinstance(other, Index):
            raise TypeError("other must be of type Index.")

        # Index vs MultiIndex (or subclass mismatch) are never equal.
        if type(self) is not type(other):
            return False

        if len(self) != len(other):
            return False

        from arkouda.numpy.pdarrayclass import all as akall

        if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
            # Compare level-by-level; overall lengths already match.
            if self.nlevels != other.nlevels:
                return False
            for i in range(self.nlevels):
                if not self.levels[i].equals(other.levels[i]):
                    return False
            return True
        else:
            result = akall(self == other)
            # akall may return a non-scalar for unexpected comparisons; only a
            # genuine bool result is trusted.
            if isinstance(result, (bool, np.bool_)):
                return result

        return False
def _reindex(self, perm): """ Return a new Index (or MultiIndex) with values reordered by the given permutation. Parameters ---------- perm : pdarray or list The permutation indices used to reorder the Index. Returns ------- Index or MultiIndex A new Index or MultiIndex with reordered values. """ if isinstance(self, MultiIndex): # Reindex each level of the MultiIndex return MultiIndex(self[perm].levels, name=self.name, names=self.names) elif isinstance(self.values, list): # Convert perm to list if necessary (for Python-native lists) if not isinstance(perm, list): perm = perm.to_list() new_values = [self.values[i] for i in perm] return Index(new_values, name=self.name, allow_list=True) else: # Assume perm is a pdarray and self.values is an Arkouda array return Index(self.values[perm], name=self.name)
    @typechecked
    def sort_values(
        self, return_indexer: bool = False, ascending: bool = True, na_position: str = "last"
    ) -> Union[Index, Tuple[Index, Union[pdarray, list]]]:
        """
        Return a sorted copy of the index.

        Parameters
        ----------
        return_indexer : bool, default False
            If True, also return the integer positions that sort the index.
        ascending : bool, default True
            Sort in ascending order. Use False for descending.
        na_position : {'first', 'last'}, default 'last'
            Where to position NaNs. 'first' puts NaNs at the beginning,
            'last' at the end.

        Returns
        -------
        Union[Index, Tuple[Index, Union[pdarray, list]]]
            sorted_index : arkouda.Index
                A new Index whose values are sorted.
            indexer : Union[arkouda.pdarray, list], optional
                The indices that would sort the original index.
                Only returned when ``return_indexer=True``.

        Examples
        --------
        >>> import arkouda as ak
        >>> idx = ak.Index([10, 100, 1, 1000])
        >>> idx
        Index(array([10 100 1 1000]), dtype='int64')

        Sort in ascending order (default):

        >>> idx.sort_values()
        Index(array([1 10 100 1000]), dtype='int64')

        Sort in descending order and get the sort positions:

        >>> idx.sort_values(ascending=False, return_indexer=True)
        (Index(array([1000 100 10 1]), dtype='int64'), array([3 1 0 2]))
        """
        import numpy as np
        from numpy import argsort as np_argsort
        from numpy import flip as np_flip
        from numpy import isnan as np_isnan

        from arkouda.numpy.dtypes import is_supported_number
        from arkouda.numpy.numeric import isnan as ak_isnan
        from arkouda.numpy.pdarrayclass import pdarray
        from arkouda.numpy.pdarraysetops import concatenate
        from arkouda.numpy.strings import Strings
        from arkouda.numpy.util import is_float
        from arkouda.pandas.categorical import Categorical

        if na_position not in {"first", "last"}:
            raise ValueError("na_position must be 'first' or 'last'.")

        perm: Union[pdarray, list]
        if isinstance(self, MultiIndex):
            # Lexicographic sort across all levels.
            perm = coargsort(self.levels, ascending=ascending)
        elif isinstance(self.values, list):
            # Client-side list: sort locally with NumPy.
            perm = type_cast(list[int],
                             np_argsort(self.values).tolist())
            if not ascending:
                perm = type_cast(list[int], np_flip(perm).tolist())
            if all(is_supported_number(x) for x in self.values):
                # Move NaNs to the requested end, preserving the sorted order
                # of the non-NaN entries.
                is_nan = np_isnan(self.values)[perm]
                perm_array = np.array(perm)
                if na_position == "last":
                    perm = np.concatenate([perm_array[~is_nan], perm_array[is_nan]]).tolist()
                else:
                    perm = np.concatenate([perm_array[is_nan], perm_array[~is_nan]]).tolist()
        elif isinstance(self.values, (Strings, Categorical, pdarray)):
            # Server-side sort.
            perm = argsort(self.values, ascending=ascending)
            if is_float(self.values):
                # Same NaN repositioning, done with arkouda arrays.
                is_nan = ak_isnan(type_cast(pdarray, self.values))[perm]
                if na_position == "last":
                    perm = type_cast(pdarray, concatenate([perm[~is_nan], perm[is_nan]]))
                else:
                    perm = type_cast(pdarray, concatenate([perm[is_nan], perm[~is_nan]]))
        else:
            raise TypeError(f"Unsupported index dtype: {type(self.values)}")

        if return_indexer:
            return self._reindex(perm), perm
        else:
            return self._reindex(perm)
[docs] def memory_usage(self, unit="B"): """ Return the memory usage of the Index values. Parameters ---------- unit : str, default = "B" Unit to return. One of {'B', 'KB', 'MB', 'GB'}. Returns ------- int Bytes of memory consumed. See Also -------- arkouda.numpy.pdarrayclass.nbytes arkouda.pandas.index.MultiIndex.memory_usage arkouda.pandas.series.Series.memory_usage arkouda.pandas.dataframe.DataFrame.memory_usage Examples -------- >>> import arkouda as ak >>> idx = Index(ak.array([1, 2, 3])) >>> idx.memory_usage() 24 """ from arkouda.numpy.util import convert_bytes return convert_bytes(self.values.nbytes, unit=unit)
    def to_pandas(self):
        """
        Convert this Arkouda-backed index wrapper to an equivalent pandas Index.

        This method materializes the underlying values into a local NumPy
        array (or pandas Categorical, when applicable) and returns the
        corresponding pandas ``Index`` (or ``CategoricalIndex``).

        Returns
        -------
        pandas.Index
            A pandas Index representing the same logical values. For
            categorical data, a ``pandas.CategoricalIndex`` is returned.

        Notes
        -----
        - If the underlying values are categorical, this returns a
          ``pandas.CategoricalIndex``.
        - For unicode string-like data (or object arrays inferred as strings),
          this attempts to return a pandas "string" dtype Index to match
          pandas' missing-value behavior (e.g., NA handling).
        - Fixed-width bytes data is preserved as bytes (no implicit decoding).

        Examples
        --------
        >>> import arkouda as ak
        >>> import pandas
        >>> idx = ak.Index(ak.array([1,2,3]))
        >>> pidx = idx.to_pandas()
        >>> pidx.dtype
        dtype('<i8')
        """
        from arkouda.pandas.categorical import Categorical

        def _materialize(values):
            """Return a concrete local ndarray-like for pandas construction."""
            if isinstance(values, list):
                return np.asarray(values)
            # Arkouda arrays expose to_ndarray(); anything else goes through
            # np.asarray as a last resort.
            if hasattr(values, "to_ndarray"):
                return values.to_ndarray()
            return np.asarray(values)

        values = self.values

        # 1) Categorical: preserve CategoricalIndex behavior
        if isinstance(values, Categorical):
            cat = values.to_pandas()
            if isinstance(cat, pd.Index):
                return cat.rename(self.name)
            return pd.CategoricalIndex(cat, name=self.name)

        val = _materialize(values)
        dtype = getattr(val, "dtype", None)
        kind = getattr(dtype, "kind", None)

        # 2) Unicode: prefer pandas StringDtype for NA semantics.
        if kind == "U":
            return pd.Index(pd.array(val, dtype="str"), name=self.name)

        # 3) For stable non-string dtypes, preserve dtype explicitly to avoid
        #    inference drift.
        #    Covers: bool, int, uint, float, complex, datetime64, timedelta64.
        if kind in ("b", "i", "u", "f", "c", "M", "m"):
            return pd.Index(val, dtype=dtype, name=self.name)

        # 4) Fallback: let pandas decide (covers unusual/extension-ish cases).
        return pd.Index(val, name=self.name)
[docs] def to_ndarray(self): """ Convert the Index values to a NumPy ndarray. Returns ------- numpy.ndarray A NumPy array representation of the Index values. """ if isinstance(self.values, list): return ndarray(self.values) else: val = convert_if_categorical(self.values) return val.to_ndarray()
[docs] def tolist(self): """ Convert the Index values to a Python list. Returns ------- list A list containing the Index values. """ if isinstance(self.values, list): return self.values else: return self.to_ndarray().tolist()
[docs] def set_dtype(self, dtype): """ Change the data type of the index. Currently only aku.ip_address and ak.array are supported. """ new_idx = dtype(self.values) self.values = new_idx return self
    def register(self, user_defined_name):
        """
        Register this Index object and underlying components with the Arkouda server.

        Parameters
        ----------
        user_defined_name : str
            user defined name the Index is to be registered under,
            this will be the root name for underlying components

        Returns
        -------
        Index
            The same Index which is now registered with the arkouda server and
            has an updated name. This is an in-place modification, the original
            is returned to support a fluid programming style. Please note you
            cannot register two different Indexes with the same name.

        Raises
        ------
        TypeError
            Raised if user_defined_name is not a str
        RegistrationError
            If the server was unable to register the Index with the user_defined_name

        See Also
        --------
        unregister, attach, is_registered

        Notes
        -----
        Objects registered with the server are immune to deletion until they are
        unregistered.
        """
        from arkouda.pandas.categorical import Categorical

        # List-backed values live only on the client; nothing to register.
        if isinstance(self.values, list):
            raise TypeError("Index cannot be registered when values are list type.")

        from arkouda.core.client import generic_msg

        if self.registered_name is not None and self.is_registered():
            raise RegistrationError(f"This object is already registered as {self.registered_name}")
        generic_msg(
            cmd="register",
            args={
                "name": user_defined_name,
                "objType": self.objType,
                "num_idxs": 1,
                # Categorical values register each component array by name,
                # encoded as a JSON map; other value types register by the
                # single array name.
                "idx_names": [
                    (
                        json.dumps(
                            {
                                "codes": self.values.codes.name,
                                "categories": self.values.categories.name,
                                "NA_codes": self.values._akNAcode.name,
                                # permutation/segments are optional components.
                                **(
                                    {"permutation": self.values.permutation.name}
                                    if self.values.permutation is not None
                                    else {}
                                ),
                                **(
                                    {"segments": self.values.segments.name}
                                    if self.values.segments is not None
                                    else {}
                                ),
                            }
                        )
                        if isinstance(self.values, Categorical)
                        else self.values.name
                    )
                ],
                "idx_types": [self.values.objType],
            },
        )
        self.registered_name = user_defined_name
        return self
[docs] def unregister(self): """ Unregister this Index object in the arkouda server. Unregister this Index object in the arkouda server, which was previously registered using register() and/or attached to using attach(). Raises ------ RegistrationError If the object is already unregistered or if there is a server error when attempting to unregister See Also -------- register, attach, is_registered Notes ----- Objects registered with the server are immune to deletion until they are unregistered. """ from arkouda.numpy.util import unregister if not self.registered_name: raise RegistrationError("This object is not registered") unregister(self.registered_name) self.registered_name = None
    def is_registered(self):
        """
        Return whether the object is registered.

        Return True iff the object is contained in the registry or is a
        component of a registered object.

        Returns
        -------
        numpy.bool
            Indicates if the object is contained in the registry

        Raises
        ------
        RegistrationError
            Raised if there's a server-side error or a mis-match of registered
            components

        See Also
        --------
        register, attach, unregister

        Notes
        -----
        Objects registered with the server are immune to deletion until they
        are unregistered.
        """
        from arkouda.numpy.util import is_registered
        from arkouda.pandas.categorical import Categorical

        if self.registered_name is None:
            # No top-level registration: check whether the underlying arrays
            # are registered as components of some other object.
            if not isinstance(self.values, Categorical):
                return is_registered(self.values.name, as_component=True)
            else:
                # A Categorical is registered only if all of its component
                # arrays are.
                result = True
                result &= is_registered(self.values.codes.name, as_component=True)
                result &= is_registered(self.values.categories.name, as_component=True)
                result &= is_registered(self.values._akNAcode.name, as_component=True)
                if self.values.permutation is not None and self.values.segments is not None:
                    result &= is_registered(self.values.permutation.name, as_component=True)
                    result &= is_registered(self.values.segments.name, as_component=True)
                return result
        else:
            return is_registered(self.registered_name)
[docs] def to_dict(self, label): """ Convert the Index to a dictionary with a specified label. Parameters ---------- label : str or list of str The key to use in the resulting dictionary. If a list is provided, only the first element is used. If None, defaults to "idx". Returns ------- dict A dictionary with the label as the key and the Index as the value. """ data = {} if label is None: label = "idx" elif isinstance(label, list): label = label[0] data[label] = self.values return data
def _check_types(self, other): """ Ensure that the type of the other object matches this Index. Parameters ---------- other : Index The object to compare against. Raises ------ TypeError If the types of the two objects do not match. """ if type(self) is not type(other): raise TypeError("Index Types must match") def _merge(self, other): """ Merge this Index with another, removing duplicates. Parameters ---------- other : Index The Index to merge with this one. Returns ------- Index A new Index containing the unique values from both indices. Raises ------ TypeError If the types of the two Index objects do not match. """ self._check_types(other) callback = get_callback(self.values) idx = generic_concat([self.values, other.values], ordered=False) return Index(callback(unique(idx))) def _merge_all(self, idx_list): """ Merge this Index with a list of other Index objects, removing duplicates. Parameters ---------- idx_list : list of Index A list of Index objects to merge with this one. Returns ------- Index A new Index containing the unique values from all merged indices. Raises ------ TypeError If any object in the list is not the same type as this Index. """ idx = self.values callback = get_callback(idx) for other in idx_list: self._check_types(other) idx = generic_concat([idx, other.values], ordered=False) return Index(callback(unique(idx))) def _check_aligned(self, other): """ Check whether this Index is aligned with another. Two indices are considered aligned if they have the same length and all corresponding elements are equal. Parameters ---------- other : Index The Index to compare against. Returns ------- bool True if the indices are aligned, False otherwise. Raises ------ TypeError If the types of the two Index objects do not match. """ self._check_types(other) length = len(self) return len(other) == length and (self == other.values).sum() == length
def argsort(self, ascending: bool = True) -> Union[list, pdarray]:
    """
    Return the permutation that sorts the Index.

    Parameters
    ----------
    ascending : bool, optional
        If True (default), sort in ascending order. If False, sort in
        descending order.

    Returns
    -------
    list or pdarray
        Indices that would sort the Index.

    Examples
    --------
    >>> import arkouda as ak
    >>> idx = ak.Index([10, 3, 5])
    >>> idx.argsort()
    array([1 2 0])
    """
    vals = self.values
    # Small indices may be backed by a plain Python list; sort positions
    # locally in that case.
    if isinstance(vals, list):
        return sorted(range(self.size), key=vals.__getitem__, reverse=not ascending)
    if hasattr(vals, "argsort"):
        return vals.argsort(ascending=ascending)
    raise TypeError(f"Index values of type {type(self.values)} do not support argsort")
def map(self, arg: Union[dict, "Series"]) -> "Index":
    """
    Map values of Index according to an input mapping.

    Parameters
    ----------
    arg : dict or Series
        The mapping correspondence.

    Returns
    -------
    arkouda.pandas.index.Index
        A new index with the values transformed by the mapping correspondence.

    Raises
    ------
    TypeError
        Raised if arg is not of type dict or arkouda.pandas.Series.
        Raised if index values not of type pdarray, Categorical, or Strings.

    Examples
    --------
    >>> import arkouda as ak
    >>> idx = ak.Index(ak.array([2, 3, 2, 3, 4]))
    >>> idx
    Index(array([2 3 2 3 4]), dtype='int64')
    >>> idx.map({4: 25.0, 2: 30.0, 1: 7.0, 3: 5.0})
    Index(array([30.00000000000000000 5.00000000000000000 30.00000000000000000
    5.00000000000000000 25.00000000000000000]), dtype='float64')
    >>> s2 = ak.Series(ak.array(["a","b","c","d"]), index = ak.array([4,2,1,3]))
    >>> idx.map(s2)
    Index(array(['b', 'd', 'b', 'd', 'a']), dtype='<U0')
    """
    from arkouda.numpy.util import map as ak_map

    mapped = ak_map(self.values, arg)
    return Index(mapped)
def concat(self, other):
    """
    Concatenate this Index with another Index.

    Parameters
    ----------
    other : Index
        The Index to concatenate with this one.

    Returns
    -------
    Index
        A new Index with values from both indices.

    Raises
    ------
    TypeError
        If the types of the two Index objects do not match.
    """
    self._check_types(other)
    combined = generic_concat([self.values, other.values], ordered=True)
    # The result keeps a name only when both inputs agree on it.
    shared_name = None
    if self.name == getattr(other, "name", None):
        shared_name = self.name
    return Index(combined, name=shared_name)
def lookup(self, key):
    """
    Check for presence of key(s) in the Index.

    Parameters
    ----------
    key : pdarray or scalar
        The value(s) to look up in the Index. If a scalar is provided, it
        will be converted to a one-element array.

    Returns
    -------
    pdarray
        A boolean array of length ``len(self)``, indicating which entries of
        the Index are present in `key`.

    Raises
    ------
    TypeError
        If `key` cannot be converted to an arkouda array.
    """
    from arkouda.numpy.pdarrayclass import pdarray
    from arkouda.numpy.pdarraycreation import array

    if not isinstance(key, pdarray):
        # Promote a scalar key to a one-element arkouda array so in1d can
        # operate on it uniformly.
        try:
            key = array([key])
        except Exception as e:
            # Chain the underlying failure so callers can see why the
            # conversion to an arkouda array failed.
            raise TypeError("Lookup must be on an arkouda array") from e
    return in1d(self.values, key)
def to_hdf(
    self,
    prefix_path: str,
    dataset: str = "index",
    mode: Literal["truncate", "append"] = "truncate",
    file_type: Literal["single", "distribute"] = "distribute",
) -> str:
    """
    Save the Index to HDF5.

    The object can be saved to a collection of files or single file.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str
        Name of the dataset to create in files (must not already exist)
    mode : str {'truncate' | 'append'}
        By default, truncate (overwrite) output files, if they exist.
        If 'append', attempt to create new dataset in existing files.
    file_type: str ("single" | "distribute")
        Default: "distribute"
        When set to single, dataset is written to a single file.
        When distribute, dataset is written on a file per locale.
        This is only supported by HDF5 files and will have no impact of
        Parquet Files.

    Returns
    -------
    string
        message indicating result of save operation

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray
    TypeError
        Raised if the Index values are a list.

    Notes
    -----
    - The prefix_path must be visible to the arkouda server and the user must
      have write permission.
    - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where
      ``<i>`` ranges from 0 to ``numLocales`` for `file_type='distribute'`.
      Otherwise, the file name will be `prefix_path`.
    - If any of the output files already exist and the mode is 'truncate',
      they will be overwritten. If the mode is 'append' and the number of
      output files is less than the number of locales or a dataset with the
      same name already exists, a ``RuntimeError`` will result.
    - Any file extension can be used.The file I/O does not rely on the
      extension to determine the file format.
    """
    from arkouda.core.client import generic_msg
    from arkouda.pandas.categorical import Categorical as Categorical_
    from arkouda.pandas.io import _file_type_to_int, _mode_str_to_int

    # A list-backed Index has no server-side representation to write.
    if isinstance(self.values, list):
        raise TypeError("Unable to write Index to hdf when values are a list.")

    # For Categorical values the server needs the name of every component
    # array, bundled into one JSON payload; for other value types a single
    # server-side symbol name suffices.
    index_data = [
        (
            self.values.name
            if not isinstance(self.values, (Categorical_))
            else json.dumps(
                {
                    "codes": self.values.codes.name,
                    "categories": self.values.categories.name,
                    "NA_codes": self.values._akNAcode.name,
                    # permutation/segments are optional components; include
                    # them only when present.
                    **(
                        {"permutation": self.values.permutation.name}
                        if self.values.permutation is not None
                        else {}
                    ),
                    **(
                        {"segments": self.values.segments.name}
                        if self.values.segments is not None
                        else {}
                    ),
                }
            )
        )
    ]
    return generic_msg(
        cmd="tohdf",
        args={
            "filename": prefix_path,
            "dset": dataset,
            "file_format": _file_type_to_int(file_type),
            "write_mode": _mode_str_to_int(mode),
            "objType": self.objType,
            "num_idx": 1,
            "idx": index_data,
            "idx_objTypes": [self.values.objType],  # this will be pdarray, strings, or cat
            "idx_dtypes": [str(self.values.dtype)],
        },
    )
def update_hdf(
    self,
    prefix_path: str,
    dataset: str = "index",
    repack: bool = True,
):
    """
    Overwrite the dataset with the name provided with this Index object.

    If the dataset does not exist it is added.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str
        Name of the dataset to create in files
    repack : bool
        Default: True
        HDF5 does not release memory on delete. When True, the inaccessible
        data (that was overwritten) is removed. When False, the data remains,
        but is inaccessible. Setting to false will yield better performance,
        but will cause file sizes to expand.

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the index

    Notes
    -----
    - If file does not contain File_Format attribute to indicate how it was
      saved, the file name is checked for _LOCALE#### to determine if it is
      distributed.
    - If the dataset provided does not exist, it will be added
    - Because HDF5 deletes do not release memory, this will create a copy of
      the file with the new data
    """
    from arkouda.core.client import generic_msg
    from arkouda.pandas.categorical import Categorical as Categorical_
    from arkouda.pandas.io import (
        _file_type_to_int,
        _get_hdf_filetype,
        _mode_str_to_int,
        _repack_hdf,
    )

    # Determine the format (single/distribute) that the file was saved in,
    # so the append below matches the existing on-disk layout.
    file_type = _get_hdf_filetype(prefix_path + "*")

    # For Categorical values the server needs every component array name,
    # packed as JSON; other value types are identified by a single name.
    index_data = [
        (
            self.values.name
            if not isinstance(self.values, (Categorical_))
            else json.dumps(
                {
                    "codes": self.values.codes.name,
                    "categories": self.values.categories.name,
                    "NA_codes": self.values._akNAcode.name,
                    **(
                        {"permutation": self.values.permutation.name}
                        if self.values.permutation is not None
                        else {}
                    ),
                    **(
                        {"segments": self.values.segments.name}
                        if self.values.segments is not None
                        else {}
                    ),
                }
            )
        )
    ]
    # Previously this call was wrapped in a spurious one-element tuple
    # expression; call generic_msg directly instead.
    generic_msg(
        cmd="tohdf",
        args={
            "filename": prefix_path,
            "dset": dataset,
            "file_format": _file_type_to_int(file_type),
            "write_mode": _mode_str_to_int("append"),
            "objType": self.objType,
            "num_idx": 1,
            "idx": index_data,
            "idx_objTypes": [self.values.objType],  # this will be pdarray, strings, or cat
            "idx_dtypes": [str(self.values.dtype)],
            "overwrite": True,
        },
    )
    if repack:
        _repack_hdf(prefix_path)
def to_parquet(
    self,
    prefix_path: str,
    dataset: str = "index",
    mode: Literal["truncate", "append"] = "truncate",
    compression: Optional[str] = None,
):
    """
    Save the Index to Parquet.

    The result is a collection of files, one file per locale of the arkouda
    server, where each filename starts with prefix_path. Each locale saves
    its chunk of the array to its corresponding file.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str
        Name of the dataset to create in files (must not already exist)
    mode : {'truncate' | 'append'}
        By default, truncate (overwrite) output files, if they exist.
        If 'append', attempt to create new dataset in existing files.
    compression : str (Optional)
        (None | "snappy" | "gzip" | "brotli" | "zstd" | "lz4")
        Sets the compression type used with Parquet files

    Returns
    -------
    string
        message indicating result of save operation

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray
    TypeError
        Raised if the Index values are a list.

    Notes
    -----
    - The prefix_path must be visible to the arkouda server and the user must
      have write permission.
    - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where
      ``<i>`` ranges from 0 to ``numLocales`` for `file_type='distribute'`.
    - 'append' write mode is supported, but is not efficient.
    - If any of the output files already exist and the mode is 'truncate',
      they will be overwritten. If the mode is 'append' and the number of
      output files is less than the number of locales or a dataset with the
      same name already exists, a ``RuntimeError`` will result.
    - Any file extension can be used.The file I/O does not rely on the
      extension to determine the file format.
    """
    # A list-backed Index has no server-side array to write out.
    if isinstance(self.values, list):
        raise TypeError("Unable to write Index to parquet when values are a list.")
    # Delegate the actual write to the underlying values object.
    return self.values.to_parquet(
        prefix_path, dataset=dataset, mode=mode, compression=compression
    )
@typechecked
def to_csv(
    self,
    prefix_path: str,
    dataset: str = "index",
    col_delim: str = ",",
    overwrite: bool = False,
):
    r"""
    Write Index to CSV file(s).

    File will contain a single column with the pdarray data. All CSV files
    written by Arkouda include a header denoting data types of the columns.

    Parameters
    ----------
    prefix_path: str
        The filename prefix to be used for saving files. Files will have
        _LOCALE#### appended when they are written to disk.
    dataset: str
        Column name to save the pdarray under. Defaults to "index".
    col_delim: str
        Defaults to ",". Value to be used to separate columns within the
        file. Please be sure that the value used DOES NOT appear in your
        dataset.
    overwrite: bool
        Defaults to False. If True, any existing files matching your
        provided prefix_path will be overwritten. If False, an error will be
        returned if existing files are found.

    Returns
    -------
    str
        response message

    Raises
    ------
    ValueError
        Raised if all datasets are not present in all parquet files or if
        one or more of the specified files do not exist.
    RuntimeError
        Raised if one or more of the specified files cannot be opened.
        If `allow_errors` is true this may be raised if no values are
        returned from the server.
    TypeError
        Raised if we receive an unknown arkouda_type returned from the
        server.
        Raised if the Index values are a list.

    Notes
    -----
    - CSV format is not currently supported by load/load_all operations
    - The column delimiter is expected to be the same for column names and
      data
    - Be sure that column delimiters are not found within your data.
    - All CSV files must delimit rows using newline (`\n`) at this time.
    """
    # A list-backed Index has no server-side array to write out.
    if isinstance(self.values, list):
        raise TypeError("Unable to write Index to csv when values are a list.")
    # Delegate the actual write to the underlying values object.
    return self.values.to_csv(prefix_path, dataset=dataset, col_delim=col_delim, overwrite=overwrite)
class MultiIndex(Index):
    """
    A multi-level, or hierarchical, index object for Arkouda DataFrames and Series.

    A MultiIndex allows you to represent multiple dimensions of indexing using a
    single object, enabling advanced indexing and grouping operations.

    This class mirrors the behavior of pandas' MultiIndex while leveraging
    Arkouda's distributed data structures. Internally, it stores a list of Index
    objects, each representing one level of the hierarchy.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda.pandas.index import MultiIndex
    >>> a = ak.array([1, 2, 3])
    >>> b = ak.array(['a', 'b', 'c'])
    >>> mi = MultiIndex([a, b])
    >>> mi[1]
    MultiIndex([np.int64(2), np.str_('b')])
    """

    from arkouda.numpy.dtypes import int_scalars

    objType = "MultiIndex"

    # _name/_names hold the optional index label(s); levels holds one
    # array-like per level of the hierarchy; size is the per-level length.
    _name: str | None
    _names: Iterable[Union[Hashable, None]]
    levels: list[Union[pdarray, Strings, Categorical]]
    size: int_scalars
    registered_name: Union[str, None]

    def __init__(
        self,
        data: Union[list, tuple, pd.MultiIndex, MultiIndex],
        name: Optional[str] = None,
        names: Optional[Iterable[Union[Hashable, None]]] = None,
    ):
        """
        Construct a MultiIndex from level arrays, a pandas MultiIndex, or
        another MultiIndex.

        Parameters
        ----------
        data : list, tuple, pd.MultiIndex, or MultiIndex
            The level data. Lists/tuples are taken as one array-like per level.
        name : str, optional
            A single name for the index as a whole.
        names : iterable of hashable, optional
            One name per level.

        Raises
        ------
        TypeError
            If `data` is not an iterable, ak.MultiIndex, or pd.MultiIndex.
        ValueError
            If the levels do not all have the same length.
        """
        from arkouda.numpy.pdarraycreation import array
        from arkouda.pandas.categorical import Categorical

        self.registered_name: Optional[str] = None
        if isinstance(data, MultiIndex):
            self.levels = data.levels
        elif isinstance(data, pd.MultiIndex):
            # Categorical pandas levels become arkouda Categoricals; all
            # other levels are converted with ak.array.
            self.levels = [
                (
                    Categorical(data.get_level_values(i).values)
                    if isinstance(data.get_level_values(i).values, pd.Categorical)
                    else array(data.get_level_values(i).values)
                )
                for i in range(data.nlevels)
            ]
        elif isinstance(data, (list, tuple)):
            self.levels = list(data)
        else:
            raise TypeError("MultiIndex should be an iterable, ak.MultiIndex, or pd.MultiIndex")

        first = True
        for col in self.levels:
            # col can be a python int which doesn't have a size attribute
            col_size = col.size if not isinstance(col, int) else 0
            if first:
                # we are implicitly assuming levels contains arkouda types and
                # not python lists because we are using obj.size/obj.dtype
                # instead of len(obj)/type(obj); this should be made explicit
                # using typechecking
                self.size = col_size
                first = False
            else:
                if col_size != self.size:
                    raise ValueError("All columns in MultiIndex must have same length")

        # Inherit the name from the source index unless one was given.
        if not name and isinstance(data, (MultiIndex, pd.MultiIndex)) and isinstance(data.name, str):
            self._name = data.name
        else:
            self._name = name

        # Per-level names: explicit argument wins, then the source index's
        # names, then a None placeholder per level.
        if names is not None:
            self._names = list(names)
        elif isinstance(data, (MultiIndex, pd.MultiIndex)) and data.names:
            self._names = list(data.names)
        else:
            self._names = [None for _i in range(len(self.levels))]

    def __getitem__(self, key):
        """
        Retrieve item(s) from the MultiIndex.

        Parameters
        ----------
        key : int, slice, list, or Series
            The position(s) or boolean mask used to index each component
            Index. If a Series is provided, its levels are used for indexing.

        Returns
        -------
        MultiIndex
            A new MultiIndex with components indexed by `key`.
        """
        from arkouda.pandas.series import Series

        if isinstance(key, Series):
            key = key.levels
        return MultiIndex([i[key] for i in self.index])

    def __repr__(self):
        """
        Return a string representation of the MultiIndex.

        Returns
        -------
        str
            A printable representation of the MultiIndex object.
        """
        return f"MultiIndex({repr(self.index)})"

    def __len__(self):
        """
        Return the number of elements in the MultiIndex.

        Returns
        -------
        int
            Number of elements in the Index.
        """
        # All levels share the same length, so report the first level's
        # length. (len(self.index) would instead return the number of
        # levels, which contradicts this docstring and would break the
        # element-wise comparison in __eq__.)
        return len(self.index[0])

    def __eq__(self, v):
        """
        Check element-wise equality between this MultiIndex and another.

        Parameters
        ----------
        v : MultiIndex, list, or tuple
            The object to compare with. Must be another MultiIndex or a
            list/tuple of Index components.

        Returns
        -------
        pdarray
            A boolean array indicating where the two MultiIndex objects are
            equal.

        Raises
        ------
        TypeError
            If the input is not a MultiIndex, list, or tuple.
        """
        if not isinstance(v, (list, tuple, MultiIndex)):
            raise TypeError("Cannot compare MultiIndex to a scalar")
        retval = ones(len(self), dtype=akbool)
        if isinstance(v, MultiIndex):
            v = v.index
        for a, b in zip(self.index, v):
            retval &= a == b
        return retval

    @property
    def names(self):
        """Return Index or MultiIndex names."""
        return self._names

    @property
    def name(self):
        """Return Index or MultiIndex name."""
        return self._name

    @property
    def index(self):
        """
        Return the levels of the MultiIndex.

        Returns
        -------
        list
            A list of Index objects representing the levels of the MultiIndex.
        """
        return self.levels

    @property
    def nlevels(self) -> int:
        """
        Integer number of levels in this MultiIndex.

        See Also
        --------
        Index.nlevels
        """
        return len(self.levels)

    @property
    def ndim(self):
        """
        Number of dimensions of the underlying data, by definition 1.

        See Also
        --------
        Index.ndim
        """
        return 1

    @property
    def inferred_type(self) -> str:
        """
        Return the inferred type of the MultiIndex.

        Returns
        -------
        str
            The string "mixed", indicating the MultiIndex may contain
            multiple types.
        """
        return "mixed"

    @property
    def dtype(self) -> npdtype:
        """Return the dtype object of the underlying data."""
        return npdtype("O")
def get_level_values(self, level: Union[str, int]):
    """
    Return the values at a particular level of the MultiIndex.

    Parameters
    ----------
    level : int or str
        The level number or name. If a string is provided, it must match an
        entry in `self.names`. Negative integers index from the end, as with
        Python sequences.

    Returns
    -------
    Index
        An Index object corresponding to the requested level.

    Raises
    ------
    RuntimeError
        If `self.names` is None and a string level is provided.
    ValueError
        If the provided string is not in `self.names`, or if the level index
        is out of bounds.
    """
    if isinstance(level, str):
        if self.names is None:
            raise RuntimeError("Cannot get level values because Index.names is None.")
        elif level not in self.names:
            raise ValueError(
                f'Cannot get level values because level "{level}" is not in Index.names.'
            )
        elif isinstance(self.names, list):
            level = self.names.index(level)
    # Accept any valid Python list index, including negatives down to
    # -nlevels (the previous abs(level) < nlevels check wrongly rejected
    # level == -nlevels, which is a valid first-level reference).
    if isinstance(level, int) and -self.nlevels <= level < self.nlevels:
        name = None
        if isinstance(self.names, list):
            name = self.names[level]
        return Index(self.levels[level], name=name)
    else:
        raise ValueError(
            "Cannot get level values because level must be a string in names or "
            "an integer in the range [-nlevels, nlevels)."
        )
def equal_levels(self, other: MultiIndex) -> builtins.bool:
    """Return True if the levels of both MultiIndex objects are the same."""
    # Differing level counts can never be equal; otherwise every level
    # pair must compare equal.
    if self.nlevels != other.nlevels:
        return False
    return all(mine.equals(theirs) for mine, theirs in zip(self.levels, other.levels))
def memory_usage(self, unit="B"):
    """
    Return the memory usage of the MultiIndex levels.

    Parameters
    ----------
    unit : str, default = "B"
        Unit to return. One of {'B', 'KB', 'MB', 'GB'}.

    Returns
    -------
    int
        Bytes of memory consumed.

    See Also
    --------
    arkouda.numpy.pdarrayclass.nbytes
    arkouda.pandas.index.Index.memory_usage
    arkouda.pandas.series.Series.memory_usage
    arkouda.pandas.dataframe.DataFrame.memory_usage

    Examples
    --------
    >>> import arkouda as ak
    >>> m = ak.pandas.index.MultiIndex([ak.array([1,2,3]),ak.array([4,5,6])])
    >>> m.memory_usage()
    48
    """
    from arkouda.numpy.util import convert_bytes

    # Total the byte counts of all levels, then convert to the caller's unit.
    total_bytes = sum(level.nbytes for level in self.levels)
    return convert_bytes(total_bytes, unit=unit)
def to_pandas(self):
    """
    Convert the MultiIndex to a pandas.MultiIndex object.

    Returns
    -------
    pandas.MultiIndex
        A pandas MultiIndex with the same levels and names.

    Notes
    -----
    Categorical levels are converted to pandas categorical arrays, while
    others are converted to NumPy arrays.
    """
    from arkouda.pandas.categorical import Categorical

    arrays = []
    for level in self.index:
        if isinstance(level, Categorical):
            arrays.append(level.to_pandas())
        else:
            arrays.append(level.to_ndarray())
    result = pd.MultiIndex.from_arrays(arrays, names=self.names)
    result.name = self.name
    return result
def set_dtype(self, dtype):
    """
    Change the data type of the index.

    Currently only aku.ip_address and ak.array are supported.

    Parameters
    ----------
    dtype : callable
        A converter (e.g. ak.array or aku.ip_address) applied to each level.

    Returns
    -------
    MultiIndex
        This MultiIndex, modified in place (returned to support chaining).
    """
    # ``index`` is a read-only property aliasing ``levels``; assigning to it
    # (as the previous implementation did) raises AttributeError. Write to
    # the backing ``levels`` attribute instead.
    self.levels = [dtype(level) for level in self.index]
    return self
def to_ndarray(self):
    """
    Convert the MultiIndex to a NumPy ndarray of arrays.

    Returns
    -------
    numpy.ndarray
        A NumPy array where each element is an array corresponding to one
        level of the MultiIndex. Categorical levels are converted to their
        underlying arrays.
    """
    # NOTE(review): ``ndarray`` must be bound in this module's imports to an
    # array *constructor* (e.g. numpy.array); if it were the raw
    # ``numpy.ndarray`` type, the list argument would be interpreted as a
    # shape and this call would fail — verify the import binding.
    return ndarray([convert_if_categorical(val).to_ndarray() for val in self.levels])
def tolist(self):
    """
    Convert the MultiIndex to a list of lists.

    Returns
    -------
    list
        A list of Python lists, one inner list per level of the MultiIndex.
    """
    # Materialize as a NumPy array of per-level arrays, then convert the
    # whole structure to nested Python lists.
    as_ndarray = self.to_ndarray()
    return as_ndarray.tolist()
def register(self, user_defined_name):
    """
    Register this Index object and underlying components with the Arkouda server.

    Parameters
    ----------
    user_defined_name : str
        user defined name the Index is to be registered under,
        this will be the root name for underlying components

    Returns
    -------
    MultiIndex
        The same Index which is now registered with the arkouda server and
        has an updated name. This is an in-place modification, the original
        is returned to support a fluid programming style. Please note you
        cannot register two different Indexes with the same name.

    Raises
    ------
    TypeError
        Raised if user_defined_name is not a str
    RegistrationError
        If the server was unable to register the Index with the
        user_defined_name

    See Also
    --------
    unregister, attach, is_registered

    Notes
    -----
    Objects registered with the server are immune to deletion until they are
    unregistered.
    """
    from arkouda.core.client import generic_msg
    from arkouda.pandas.categorical import Categorical

    # Refuse to double-register: the existing registration must be removed
    # first via unregister().
    if self.registered_name is not None and self.is_registered():
        raise RegistrationError(f"This object is already registered as {self.registered_name}")
    generic_msg(
        cmd="register",
        args={
            "name": user_defined_name,
            "objType": self.objType,
            "num_idxs": len(self.levels),
            # Each Categorical level is described by a JSON bundle of its
            # component array names; other levels by a single symbol name.
            "idx_names": [
                (
                    json.dumps(
                        {
                            "codes": v.codes.name,
                            "categories": v.categories.name,
                            "NA_codes": v._akNAcode.name,
                            # Optional components, included only when present.
                            **(
                                {"permutation": v.permutation.name}
                                if v.permutation is not None
                                else {}
                            ),
                            **({"segments": v.segments.name} if v.segments is not None else {}),
                        }
                    )
                    if isinstance(v, Categorical)
                    else v.name
                )
                for v in self.levels
            ],
            "idx_types": [v.objType for v in self.levels],
        },
    )
    self.registered_name = user_defined_name
    return self
def unregister(self):
    """
    Unregister this MultiIndex from the Arkouda server.

    Raises
    ------
    RegistrationError
        If the MultiIndex is not currently registered.
    """
    from arkouda.numpy.util import unregister as server_unregister

    registered_as = self.registered_name
    if not registered_as:
        raise RegistrationError("This object is not registered")
    server_unregister(registered_as)
    self.registered_name = None
def is_registered(self):
    """
    Check if the MultiIndex is registered with the Arkouda server.

    Returns
    -------
    bool
        True if the MultiIndex has a registered name and is recognized by
        the server, False otherwise.
    """
    from arkouda.numpy.util import is_registered

    registered_as = self.registered_name
    # An object with no registered name cannot be registered.
    return False if registered_as is None else is_registered(registered_as)
def to_dict(self, labels=None):
    """
    Convert the MultiIndex to a dictionary representation.

    Parameters
    ----------
    labels : list of str, optional
        A list of column names for the index levels. If not provided,
        defaults to ['idx_0', 'idx_1', ..., 'idx_n'].

    Returns
    -------
    dict
        A dictionary mapping each label to the corresponding Index object.
    """
    if labels is None:
        labels = [f"idx_{i}" for i in range(len(self.index))]
    return {labels[i]: level for i, level in enumerate(self.index)}
def _merge(self, other):
    """
    Merge this MultiIndex with another MultiIndex, removing duplicates.

    Parameters
    ----------
    other : MultiIndex
        The other MultiIndex to merge with.

    Returns
    -------
    MultiIndex
        A new MultiIndex containing the unique values from both inputs.

    Raises
    ------
    TypeError
        If the type of `other` does not match.
    """
    # Merging with one other index is the single-element case of _merge_all:
    # same type check, pairwise unordered concat, and de-duplication.
    return self._merge_all([other])

def _merge_all(self, array):
    """
    Merge this MultiIndex with a list of MultiIndex objects, removing duplicates.

    Parameters
    ----------
    array : list of MultiIndex
        A list of MultiIndex objects to merge with.

    Returns
    -------
    MultiIndex
        A new MultiIndex containing the unique values from all inputs.

    Raises
    ------
    TypeError
        If any element in `array` is not a MultiIndex or has a different type.
    """
    combined = self.index
    for other in array:
        self._check_types(other)
        combined = [
            generic_concat([left, right], ordered=False)
            for left, right in zip(combined, other.index)
        ]
    # GroupBy on the concatenated levels yields the unique row tuples.
    return MultiIndex(GroupBy(combined).unique_keys)
def argsort(self, ascending=True):
    """
    Return the indices that would sort the MultiIndex.

    Parameters
    ----------
    ascending : bool, default True
        If False, the result is in descending order.

    Returns
    -------
    pdarray
        An array of indices that would sort the MultiIndex.
    """
    perm = coargsort(self.index)
    # Reversing an ascending permutation gives the descending order.
    return perm if ascending else ak_flip(perm)
def concat(self, other):
    """
    Concatenate this MultiIndex with another, preserving duplicates and order.

    Parameters
    ----------
    other : MultiIndex
        The other MultiIndex to concatenate with.

    Returns
    -------
    MultiIndex
        A new MultiIndex containing values from both inputs, preserving order.

    Raises
    ------
    TypeError
        If the type of `other` does not match.
    """
    self._check_types(other)
    combined = [
        generic_concat([left, right], ordered=True)
        for left, right in zip(self.index, other.index)
    ]
    # Only carry the level names over when both inputs agree on them.
    retained = self.names if self.names == getattr(other, "names", None) else None
    return MultiIndex(combined, names=retained)
def lookup(self, key: list[Any] | tuple[Any, ...]) -> groupable:
    """
    Perform element-wise lookup on the MultiIndex.

    Parameters
    ----------
    key : list or tuple
        A sequence of values, one for each level of the MultiIndex.

        - If the elements are scalars (e.g., ``(1, "red")``), they are treated
          as a single row key: the result is a boolean mask over rows where
          all levels match the corresponding scalar.
        - If the elements are arkouda arrays (e.g., list of pdarrays /
          Strings), they must align one-to-one with the levels, and the
          lookup is delegated to ``in1d(self.index, key)`` for multi-column
          membership.

    Returns
    -------
    groupable
        A boolean array indicating which rows in the MultiIndex match the key.

    Raises
    ------
    TypeError
        If `key` is not a list or tuple, or mixes scalar and array elements.
    ValueError
        If the length of `key` does not match the number of levels.
    """
    if not isinstance(key, (list, tuple)):
        # BUGFIX: the previous implementation iterated `key` here to report
        # per-element types, which itself raised an unrelated TypeError for
        # non-iterable keys (e.g. a bare int). Report the key's own type.
        raise TypeError(
            "MultiIndex.lookup expects a list or tuple of keys, one per level. "
            f"Received {type(key).__name__}."
        )

    if len(key) != self.nlevels:
        raise ValueError(
            f"MultiIndex.lookup key length {len(key)} must match number of levels {self.nlevels}"
        )

    # Imports are deferred until after validation so bad input fails fast.
    from arkouda.numpy.pdarraycreation import array as ak_array
    from arkouda.numpy.strings import Strings

    # Case 1: user passed per-level arkouda arrays.
    # We assume they are already the correct types and lengths.
    if all(isinstance(k, (pdarray, Strings)) for k in key):
        return in1d(self.index, key)

    # Don't allow mixed scalar/array keys. (After the early return above,
    # any array element here implies a mix.)
    if any(isinstance(k, (pdarray, Strings)) for k in key):
        raise TypeError(
            "MultiIndex.lookup key must be all scalars (row key) or all arkouda arrays "
            "(per-level membership). "
            f"Received mixed types: {[type(k) for k in key]}"
        )

    # Case 2: user passed scalars (e.g., (1, "red")).
    # Convert each scalar to a length-1 arkouda array, preserving per-level dtypes.
    scalar_key_arrays = [
        ak_array([v], dtype=self.levels[i].dtype) for i, v in enumerate(key)
    ]
    return in1d(self.index, scalar_key_arrays)
def to_hdf(
    self,
    prefix_path: str,
    dataset: str = "index",
    mode: Literal["truncate", "append"] = "truncate",
    file_type: Literal["single", "distribute"] = "distribute",
) -> str:
    """
    Save the MultiIndex to HDF5.

    The object can be saved to a collection of files or single file.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str
        Name of the dataset to create in files (must not already exist)
    mode : {'truncate' | 'append'}
        By default, truncate (overwrite) output files, if they exist.
        If 'append', attempt to create new dataset in existing files.
    file_type: {"single" | "distribute"}
        Default: "distribute"
        When set to single, dataset is written to a single file.
        When distribute, dataset is written on a file per locale.
        This is only supported by HDF5 files and will have no impact of
        Parquet Files.

    Returns
    -------
    string
        message indicating result of save operation

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray.

    Notes
    -----
    - The prefix_path must be visible to the arkouda server and the user must
      have write permission.
    - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where
      ``<i>`` ranges from 0 to ``numLocales`` for `file_type='distribute'`.
      Otherwise, the file name will be `prefix_path`.
    - If any of the output files already exist and the mode is 'truncate',
      they will be overwritten. If the mode is 'append' and the number of
      output files is less than the number of locales or a dataset with the
      same name already exists, a ``RuntimeError`` will result.
    - Any file extension can be used. The file I/O does not rely on the
      extension to determine the file format.
    """
    from arkouda.core.client import generic_msg
    from arkouda.pandas.categorical import Categorical as Categorical_
    from arkouda.pandas.io import _file_type_to_int, _mode_str_to_int

    def _level_identifier(obj):
        # Categorical levels are described by a JSON bundle of their
        # component array names; all other levels by their name alone.
        # Key insertion order matches the original payload exactly.
        if not isinstance(obj, Categorical_):
            return obj.name
        parts = {
            "codes": obj.codes.name,
            "categories": obj.categories.name,
            "NA_codes": obj._akNAcode.name,
        }
        if obj.permutation is not None:
            parts["permutation"] = obj.permutation.name
        if obj.segments is not None:
            parts["segments"] = obj.segments.name
        return json.dumps(parts)

    return generic_msg(
        cmd="tohdf",
        args={
            "filename": prefix_path,
            "dset": dataset,
            "file_format": _file_type_to_int(file_type),
            "write_mode": _mode_str_to_int(mode),
            "objType": self.objType,
            "num_idx": len(self.levels),
            "idx": [_level_identifier(obj) for obj in self.levels],
            "idx_objTypes": [obj.objType for obj in self.levels],
            "idx_dtypes": [str(obj.dtype) for obj in self.levels],
        },
    )
def update_hdf(
    self,
    prefix_path: str,
    dataset: str = "index",
    repack: bool = True,
):
    """
    Overwrite the dataset with the name provided with this Index object.

    If the dataset does not exist it is added.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str
        Name of the dataset to create in files
    repack: bool
        Default: True
        HDF5 does not release memory on delete. When True, the inaccessible
        data (that was overwritten) is removed. When False, the data remains,
        but is inaccessible. Setting to false will yield better performance,
        but will cause file sizes to expand.

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the index
    TypeError
        Raised if the Index levels are a list.

    Notes
    -----
    - If file does not contain File_Format attribute to indicate how it was
      saved, the file name is checked for _LOCALE#### to determine if it is
      distributed.
    - If the dataset provided does not exist, it will be added
    - Because HDF5 deletes do not release memory, this will create a copy of
      the file with the new data
    """
    # Validate before importing/contacting the server so bad input fails fast.
    if isinstance(self.levels, list):
        # BUGFIX: corrected the grammar of the original error message
        # ("Unable update hdf ...").
        raise TypeError("Unable to update hdf when Index levels are a list.")

    from arkouda.core.client import generic_msg
    from arkouda.pandas.categorical import Categorical as Categorical_
    from arkouda.pandas.io import (
        _file_type_to_int,
        _get_hdf_filetype,
        _mode_str_to_int,
        _repack_hdf,
    )

    # Determine the format (single/distribute) that the file was saved in.
    file_type = _get_hdf_filetype(prefix_path + "*")

    # Categorical levels are described by a JSON bundle of their component
    # array names; all other levels by their name alone.
    index_data = [
        (
            obj.name
            if not isinstance(obj, (Categorical_))
            else json.dumps(
                {
                    "codes": obj.codes.name,
                    "categories": obj.categories.name,
                    "NA_codes": obj._akNAcode.name,
                    **(
                        {"permutation": obj.permutation.name}
                        if obj.permutation is not None
                        else {}
                    ),
                    **({"segments": obj.segments.name} if obj.segments is not None else {}),
                }
            )
        )
        for obj in self.levels
    ]

    # Append the updated index data. (The previous implementation wrapped
    # this call in a pointless one-element tuple, removed here.)
    generic_msg(
        cmd="tohdf",
        args={
            "filename": prefix_path,
            "dset": dataset,
            "file_format": _file_type_to_int(file_type),
            "write_mode": _mode_str_to_int("append"),
            "objType": self.objType,
            "num_idx": len(self.levels),
            "idx": index_data,
            "idx_objTypes": [obj.objType for obj in self.levels],
            "idx_dtypes": [str(obj.dtype) for obj in self.levels],
            "overwrite": True,
        },
    )
    if repack:
        _repack_hdf(prefix_path)