# Source code for arkouda.pandas.series

from __future__ import annotations

import json
import operator

from builtins import str as builtin_str
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, TypeVar, Union, cast

import numpy as np
import pandas as pd

from pandas._config import get_option
from typeguard import typechecked

import arkouda.pandas.dataframe

from arkouda.numpy.dtypes import bool_scalars, dtype, float64, int64
from arkouda.numpy.pdarrayclass import RegistrationError, any, argmaxk, create_pdarray, pdarray
from arkouda.numpy.pdarraysetops import argsort, concatenate, in1d, indexof1d
from arkouda.numpy.util import get_callback, is_float
from arkouda.pandas.accessor import CachedAccessor, DatetimeAccessor, StringAccessor
from arkouda.pandas.groupbyclass import GroupBy, groupable, groupable_element_type
from arkouda.pandas.index import Index, MultiIndex


if TYPE_CHECKING:
    from arkouda.numpy import cast as akcast
    from arkouda.numpy.alignment import lookup
    from arkouda.numpy.pdarraycreation import arange, zeros
    from arkouda.numpy.segarray import SegArray
    from arkouda.numpy.strings import Strings
    from arkouda.pandas.categorical import Categorical
else:
    Categorical = TypeVar("Categorical")
    SegArray = TypeVar("SegArray")
    Strings = TypeVar("Strings")

# pd.set_option("display.max_colwidth", 65) is being called in DataFrame.py. This will resolve BitVector
# truncation issues. If issues arise, that's where to look for it.

__all__ = [
    "Series",
]


supported_scalars = Union[int, float, bool, builtin_str, np.int64, np.float64, np.bool_, np.str_]


def is_supported_scalar(x) -> bool:
    """Return True when ``x`` is one of the scalar types Series accepts as labels/values."""
    scalar_types = (int, float, bool, str, np.int64, np.float64, np.bool_, np.str_)
    return isinstance(x, scalar_types)


def natural_binary_operators(cls) -> type:
    """
    Class decorator that installs the standard binary dunder methods on ``cls``.

    Each dunder is generated by ``cls._make_binop`` from the corresponding
    ``operator`` function, so the class only has to define that one factory.
    """
    binop_table = {
        "__add__": operator.add,
        "__sub__": operator.sub,
        "__mul__": operator.mul,
        "__truediv__": operator.truediv,
        "__floordiv__": operator.floordiv,
        "__and__": operator.and_,
        "__or__": operator.or_,
        "__xor__": operator.xor,
        "__eq__": operator.eq,
        "__ge__": operator.ge,
        "__gt__": operator.gt,
        "__le__": operator.le,
        "__lshift__": operator.lshift,
        "__lt__": operator.lt,
        "__mod__": operator.mod,
        "__ne__": operator.ne,
        "__rshift__": operator.rshift,
        "__pow__": operator.pow,
    }
    for dunder_name, op_func in binop_table.items():
        setattr(cls, dunder_name, cls._make_binop(op_func))
    return cls


def unary_operators(cls) -> type:
    """
    Class decorator that installs the unary dunder methods on ``cls``.

    Each dunder is generated by ``cls._make_unaryop`` from the corresponding
    ``operator`` function.
    """
    unaryop_table = {
        "__invert__": operator.invert,
        "__neg__": operator.neg,
    }
    for dunder_name, op_func in unaryop_table.items():
        setattr(cls, dunder_name, cls._make_unaryop(op_func))
    return cls


@unary_operators
@natural_binary_operators
class Series:
    """
    One-dimensional Arkouda array with axis labels.

    Parameters
    ----------
    index : pdarray or Strings, optional
        An array of indices associated with the data array.
        If not provided (or empty), it defaults to a range of ints whose size
        matches the size of the data.
    data : tuple, list, groupable_element_type, Series, or SegArray
        A 1D array-like. Must not be None.

    Raises
    ------
    TypeError
        Raised if ``index`` is not a pdarray or Strings object.
        Raised if ``data`` is not a supported type.
    ValueError
        Raised if the index size does not match the data size.

    Notes
    -----
    The Series class accepts either positional arguments or keyword arguments.

    Positional arguments
        - ``Series(data)``: ``data`` is provided and an index is generated
          automatically.
        - ``Series(data, index)``: both ``data`` and ``index`` are provided.

    Keyword arguments
        - ``Series(data=..., index=...)``: ``index`` is optional but must match
          the size of ``data`` when provided.
    """

    objType = "Series"

    @typechecked
    def __init__(
        self,
        data: Union[
            Tuple,
            List,
            groupable_element_type,
            Series,
            SegArray,
            pd.Series,
            pd.Categorical,
        ],
        name=None,
        index: Optional[Union[pdarray, Strings, Tuple, List, Index]] = None,
    ):
        # Deferred imports avoid circular-import problems at module load time.
        from arkouda.numpy.pdarraycreation import arange, array
        from arkouda.numpy.segarray import SegArray
        from arkouda.numpy.strings import Strings
        from arkouda.pandas.categorical import Categorical

        if isinstance(data, pd.Categorical):
            data = Categorical(data)

        self.registered_name: Optional[str] = None

        if index is None and isinstance(data, (tuple, list)) and len(data) == 2:
            # handles the previous `ar_tuple` case: data is (index, values)
            if not isinstance(data[0], (pdarray, Index, Strings, Categorical, list, tuple)):
                raise TypeError("indices must be a pdarray, Strings, Categorical, List, or Tuple")
            if not isinstance(data[1], (pdarray, Strings, Categorical, Series, SegArray)):
                raise TypeError("values must be a pdarray, Strings, SegArray, or Categorical")
            self.values = data[1] if not isinstance(data[1], Series) else data[1].values
            # `index` is always None in this branch, so the first arm is
            # effectively dead; kept for parity with the historical code.
            self.index = Index.factory(index) if index else Index.factory(data[0])
        elif isinstance(data, pd.Series):
            # Convert a local pandas Series, preserving its index and name.
            if isinstance(data.values, pd.Categorical):
                self.values = Categorical(data.values)
            else:
                self.values = array(data.values)
            self.index = Index(data.index)
            self.name = data.name
        elif isinstance(data, tuple) and len(data) != 2:
            # BUGFIX: corrected typo in the error message ("requries" -> "requires").
            raise TypeError("Series initialization requires a tuple of (index, values)")
        else:
            # When only 1 positional argument it will be treated as data and not index
            if isinstance(data, Series):
                self.values = data.values
            elif isinstance(data, List):
                self.values = array(data)
            else:
                self.values = data
            self.index = Index.factory(index) if index is not None else Index(arange(len(self.values)))

        if self.index.size != self.values.size:
            raise ValueError(
                "Index size does not match data size: {} != {}".format(self.index.size, self.values.size)
            )
        if name is None and isinstance(data, (Series, pd.Series)):
            # Inherit the name from the source Series when none was given.
            self.name = data.name
        else:
            self.name = name
        self.size = self.index.size

    def __len__(self):
        """Return the number of elements in the Series."""
        return self.values.size

    def __repr__(self):
        """Return ascii-formatted version of the series."""
        if len(self) == 0:
            return "Series([ -- ][ 0 values : 0 B])"

        maxrows = pd.get_option("display.max_rows")
        if len(self) <= maxrows:
            prt = self.to_pandas()
            length_str = ""
        else:
            # Too long to show fully: display head and tail halves plus a
            # trailing length footer.
            prt = pd.concat(
                [
                    self.head(maxrows // 2 + 2).to_pandas(),
                    self.tail(maxrows // 2).to_pandas(),
                ]
            )
            length_str = f"\nLength {len(self)}"
        return (
            prt.to_string(
                dtype=prt.dtype,
                min_rows=get_option("display.min_rows"),
                max_rows=maxrows,
                length=False,
            )
            + length_str
        )
    def validate_key(
        self,
        key: Union[Series, pdarray, Strings, Categorical, List, supported_scalars, SegArray],
    ) -> Union[pdarray, Strings, Categorical, supported_scalars, SegArray]:
        """
        Validate type requirements for keys when reading or writing the Series.

        Also converts list and tuple arguments into pdarrays.

        Parameters
        ----------
        key : Series, pdarray, Strings, Categorical, List, supported_scalars, or SegArray
            The key or container of keys that might be used to index into the Series.

        Returns
        -------
        The validated key(s), with lists and tuples converted to pdarrays

        Raises
        ------
        TypeError
            Raised if keys are not boolean values or the type of the labels
            Raised if key is not one of the supported types
        KeyError
            Raised if container of keys has keys not present in the Series
        IndexError
            Raised if the length of a boolean key array is different from the Series
        """
        from arkouda.numpy.pdarraycreation import arange, array
        from arkouda.numpy.strings import Strings

        if isinstance(key, list):
            # Convert to a server-side array and re-validate the result.
            return self.validate_key(array(key))
        if isinstance(key, tuple):
            raise TypeError("Series does not support tuple keys")
        if isinstance(key, Series):
            # @TODO align the series indexes
            return self.validate_key(key.values)
        if is_supported_scalar(key):
            # Scalar labels must match the index dtype exactly.
            if dtype(type(key)) != self.index.dtype:
                raise TypeError(
                    "Unexpected key type. Received {} but expected {}. key: {}".format(
                        dtype(type(key)), self.index.dtype, key
                    )
                )
        elif isinstance(key, Strings):
            if self.index.dtype != dtype(str):
                raise TypeError(
                    "Unexpected key type. Received Strings but expected {}".format(self.index.dtype)
                )
            # Every string label requested must exist in the index.
            if any(~in1d(key, self.index.values)):
                raise KeyError("{} not in index".format(key[~in1d(key, self.index.values)]))
        elif isinstance(key, pdarray):
            if key.dtype == self.index.dtype:
                # Label array: every label must exist in the index.
                if any(~in1d(key, self.index.values)):
                    raise KeyError("{} not in index".format(key[~in1d(key, self.index.values)]))
            elif key.dtype == "bool_":
                # Boolean masks must be exactly as long as the Series.
                if key.size != self.index.size:
                    raise IndexError(
                        "Boolean index has wrong length: {} instead of {}".format(key.size, self.size)
                    )
            else:
                raise TypeError(
                    "Unexpected key type. Received {} but expected {}".format(
                        dtype(type(key)), self.index.dtype
                    )
                )
        elif isinstance(key, slice):
            # Materialize the slice as an array of positions.
            # NOTE(review): a negative step with default start/stop produces an
            # empty arange here — confirm negative-stride slices are intended
            # to be unsupported.
            start = key.start if key.start is not None else 0
            stop = key.stop if key.stop is not None else self.size
            stride = key.step if key.step is not None else 1
            if start < 0:
                raise IndexError("Slice start must be non-negative")
            if stop > self.size:
                raise IndexError("Slice stop must be less than or equal to the size of the Series")
            if start > stop:
                raise IndexError("Slice start must be less than or equal to the stop")
            key = arange(start, stop, stride)
        else:
            raise TypeError(
                "Series [] only supports indexing by scalars, lists of scalars, "
                "and arrays of scalars. Received {}".format(type(key))
            )
        return key
    @typechecked
    def __getitem__(self, _key: Union[supported_scalars, pdarray, Strings, List, Series]):
        """
        Get values from Series.

        Parameters
        ----------
        _key : supported_scalars, pdarray, Strings, List, or Series
            The key or container of keys to get entries for.

        Returns
        -------
        Series with all entries with matching labels. If only one entry in the
        Series is accessed, returns a scalar.
        """
        from arkouda.numpy.pdarraycreation import array
        from arkouda.numpy.strings import Strings

        key = self.validate_key(_key)
        if is_supported_scalar(key):
            # Promote a scalar label to a length-1 array and recurse.
            return self[array([key])]
        assert isinstance(key, (pdarray, Strings))
        if isinstance(key, pdarray) and key.dtype == "bool_":
            # boolean array indexes without sorting
            return Series(index=self.index[key], data=self.values[key])
        # Translate labels to positions in the index.
        indices = indexof1d(key, self.index.values)
        if len(indices) == 1:
            # A single match returns the scalar value, not a Series.
            return self.values[indices[0]]
        else:
            return Series(index=self.index[indices], data=self.values[indices])
    def validate_val(
        self, val: Union[pdarray, Strings, supported_scalars, List]
    ) -> Union[pdarray, Strings, supported_scalars]:
        """
        Validate type requirements for values being written into the Series.

        Also converts list and tuple arguments into pdarrays.

        Parameters
        ----------
        val : pdarray, Strings, supported_scalars, or List
            The value or container of values that might be assigned into the Series.

        Returns
        -------
        The validated value, with lists converted to pdarrays

        Raises
        ------
        TypeError
            Raised if val is not the same type or a container with elements of
            the same type as the Series
            Raised if val is a string or Strings type.
            Raised if val is not one of the supported types
        """
        from typing import get_args

        from arkouda.numpy.pdarraycreation import array
        from arkouda.numpy.strings import Strings

        if isinstance(val, list):
            # Convert to a server-side array; dtype is checked on assignment.
            return array(val)
        if isinstance(val, get_args(supported_scalars)):
            # Scalar values must match the values dtype exactly.
            if dtype(type(val)) != self.values.dtype:
                raise TypeError(
                    "Unexpected value type. Received {} but expected {}".format(
                        dtype(type(val)), self.values.dtype
                    )
                )
            if isinstance(val, str):
                # String-typed values cannot be modified in place server-side.
                raise TypeError("Cannot modify string type dataframes")
            return val
        elif isinstance(val, Strings):
            raise TypeError("Cannot modify string type dataframes")
        elif isinstance(val, pdarray):
            if val.dtype != self.values.dtype:
                raise TypeError(
                    "Unexpected value type. Received {} but expected {}".format(
                        dtype(type(val)), self.values.dtype
                    )
                )
            return val
        else:
            raise TypeError("cannot set with unsupported value type: {}".format(type(val)))
    def __setitem__(
        self,
        key: Union[pdarray, Strings, Categorical, Series, List, supported_scalars, SegArray],
        val: Union[pdarray, Strings, List, supported_scalars],
    ) -> None:
        """
        Set or adds entries in a Series by label.

        Parameters
        ----------
        key : pdarray, Strings, Categorical, Series, List, supported_scalars, or SegArray
            The key or container of keys to set entries for.
        val : pdarray, Strings, List, or supported_scalars
            The value or values to set/add to the Series.

        Raises
        ------
        ValueError
            Raised when setting multiple values to a Series with repeated labels
            Raised when number of values provided does not match the number of
            entries to set.
        """
        from arkouda.numpy.pdarraycreation import array
        from arkouda.numpy.strings import Strings
        from arkouda.pandas.categorical import Categorical

        val = self.validate_val(val)
        key = self.validate_key(key)
        if isinstance(key, (pdarray, Strings)) and len(key) > 1 and self.has_repeat_labels():
            raise ValueError("Cannot set with multiple keys for Series with repeated labels.")

        # `indices` becomes a boolean mask of the positions to update.
        indices = None
        if is_supported_scalar(key):
            indices = self.index == key
        else:
            # mypy: key may be scalar/SegArray/etc, but in1d only accepts groupables
            if not isinstance(key, (pdarray, Strings, Categorical, list, tuple)):
                raise TypeError(f"Unsupported key type for membership test: {type(key)}")
            # If key is a python list/tuple, it will be validated/converted by
            # validate_key in many paths but if it slips through, convert here.
            if (
                isinstance(self.index, MultiIndex)
                and isinstance(key, tuple)
                and len(key) == self.index.nlevels
            ):
                indices = self.index.lookup(key)  # returns boolean mask
            else:
                if isinstance(key, list):
                    key = array(key)
                indices = in1d(self.index.values, cast(groupable, key))

        # Count how many positions matched: GroupBy on the boolean mask yields
        # counts for False/True groups; the True count is the update count.
        tf, counts = GroupBy(indices).size()
        update_count = counts[1] if len(counts) == 2 else 0
        if update_count == 0:
            # adding a new entry (the key matched nothing in the index)
            if isinstance(val, (pdarray, Strings)):
                raise ValueError("Cannot set. Too many values provided")
            new_index_values = concatenate([self.index.values, array([key])])
            self.index = Index.factory(new_index_values)
            self.values = concatenate([self.values, array([val])])
            return
        if is_supported_scalar(val):
            # Broadcast a scalar value across every matched position.
            cast(Any, self.values)[indices] = val
            return
        else:
            val_array = cast(Union[pdarray, Strings], val)
            if val_array.size == 1 and is_supported_scalar(key):
                cast(Any, self.values)[indices] = val_array[0]
                return
            if update_count != val_array.size:
                raise ValueError(
                    "Cannot set using a list-like indexer with a different length from the value"
                )
            cast(Any, self.values)[indices] = val
        return
    def memory_usage(self, index: bool = True, unit: Literal["B", "KB", "MB", "GB"] = "B") -> Union[int, float]:
        """
        Return the memory usage of the Series.

        The memory usage can optionally include the contribution of the index.

        Parameters
        ----------
        index : bool
            Specifies whether to include the memory usage of the Series index.
            Defaults to True.
        unit : {"B", "KB", "MB", "GB"}
            Unit to return. One of {'B', 'KB', 'MB', 'GB'}. Defaults to "B".

        Returns
        -------
        int or float
            Memory consumed, in the requested unit. Non-byte units may yield a
            float (see the "KB" example below).

        See Also
        --------
        arkouda.numpy.pdarrayclass.nbytes
        arkouda.Index.memory_usage
        arkouda.pandas.series.Series.memory_usage
        arkouda.pandas.datafame.DataFrame.memory_usage

        Examples
        --------
        >>> import arkouda as ak
        >>> from arkouda.pandas.series import Series
        >>> s = ak.Series(ak.arange(3))
        >>> s.memory_usage()
        48

        Not including the index gives the size of the rest of the data, which
        is necessarily smaller:

        >>> s.memory_usage(index=False)
        24

        Select the units:

        >>> s = ak.Series(ak.arange(3000))
        >>> s.memory_usage(unit="KB")
        46.875
        """
        from arkouda.numpy.util import convert_bytes

        # cast() is a static-typing no-op; convert_bytes can return a float for
        # non-byte units, hence the int-or-float return annotation.
        v = cast(int, convert_bytes(self.values.nbytes, unit=unit))
        if index:
            v += self.index.memory_usage(unit=unit)
        return v
    def has_repeat_labels(self) -> bool:
        """Return whether the Series has any labels that appear more than once."""
        # GroupBy collapses duplicate labels, so having fewer groups than
        # index entries means at least one label repeats.
        tf, counts = GroupBy(self.index.values).size()
        return counts.size != self.index.size
    def to_ndarray(self) -> np.ndarray:
        """Convert the Series values to a local ``numpy.ndarray``."""
        return self.values.to_ndarray()
    @property
    def ndim(self) -> int:
        """Number of dimensions; always 1 for a Series."""
        return 1

    @property
    def loc(self) -> _LocIndexer:
        """
        Accesses entries of a Series by label.

        Returns
        -------
        _LocIndexer
            An indexer for label-based access to Series entries.
        """
        return _LocIndexer(self)

    @property
    def at(self) -> _LocIndexer:
        """
        Accesses entries of a Series by label.

        Returns
        -------
        _LocIndexer
            An indexer for label-based access to Series entries.
        """
        return _LocIndexer(self)

    @property
    def iloc(self) -> _iLocIndexer:
        """
        Accesses entries of a Series by position.

        Returns
        -------
        _iLocIndexer
            An indexer for position-based access to Series entries.
        """
        return _iLocIndexer("iloc", self)

    @property
    def iat(self) -> _iLocIndexer:
        """
        Accesses entries of a Series by position.

        Returns
        -------
        _iLocIndexer
            An indexer for position-based access to a single element.
        """
        return _iLocIndexer("iat", self)

    # Accessor namespaces mirroring pandas' `.dt` and `.str`.
    # NOTE: `str` deliberately shadows the builtin within the class namespace.
    dt = CachedAccessor("dt", DatetimeAccessor)
    str = CachedAccessor("str", StringAccessor)

    @property
    def shape(self) -> Tuple[int]:
        # mimic the pandas return of series shape property
        return (len(self.values),)

    @property
    def dtype(self) -> np.dtype:
        """The dtype of the underlying values array."""
        return self.values.dtype
[docs] @typechecked def isin(self, lst: Union[pdarray, Strings, List]) -> Series: """ Find Series elements whose values are in the specified list. Parameters ---------- lst : pdarray, Strings, or List Either a Python list or an Arkouda array to check membership against. Returns ------- Series A Series of booleans that is True for elements found in the list, and False otherwise. """ from arkouda.numpy.pdarraycreation import array from arkouda.numpy.strings import Strings from arkouda.pandas.categorical import Categorical if isinstance(lst, list): lst = array(lst) # mypy: lst/self.values can be a wider union (SegArray/Any) at type level. # At runtime, in1d only supports pdarray/Strings/Categorical (or sequences of those). if not isinstance(self.values, (pdarray, Strings, Categorical)): raise TypeError(f"in1d not supported for Series values type: {type(self.values)}") if not isinstance(lst, (pdarray, Strings, Categorical, list, tuple)): raise TypeError(f"in1d not supported for list type: {type(lst)}") if isinstance(lst, (list, tuple)): lst = array(lst) boolean = in1d( cast(groupable_element_type, self.values), cast(groupable_element_type, lst), ) return Series(data=boolean, index=self.index)
[docs] @typechecked def locate(self, key: Union[int, pdarray, Index, Series, List, Tuple]) -> Series: """ Lookup values by index label. Parameters ---------- key : int, pdarray, Index, Series, List, or Tuple The key or keys to look up. This can be: - A scalar - A list of scalars - A list of lists (for MultiIndex) - A Series (in which case labels are preserved, and its values are used as keys) Keys will be converted to Arkouda arrays as needed. Returns ------- Series A Series containing the values corresponding to the key. """ from arkouda.numpy.pdarraycreation import array def is_scalar_label(x) -> bool: # scalar label component (NOT array-like) return not isinstance(x, (pdarray, Index, Series, list, tuple)) def to_pdarray(obj) -> pdarray: """ Convert without ever touching pandas/numpy containers. Assumes Arkouda Index/Series store pdarrays internally. """ if isinstance(obj, pdarray): return obj if isinstance(obj, Index): # Arkouda Index wrapper: underlying pdarray is on .index return obj.index if isinstance(obj, Series): # Arkouda Series wrapper: underlying pdarray is on .values values = obj.values if isinstance(values, pdarray): return values # python scalar / python list/tuple -> arkouda pdarray (server-side) return array(obj) def rebuild_mi_with_names(mi: MultiIndex, names) -> MultiIndex: # Do not rely on mi.names setter being functional return MultiIndex(mi.levels, names=list(names)) def finalize(selector) -> Series: out_index = self.index[selector] if isinstance(out_index, MultiIndex) and isinstance(self.index, MultiIndex): out_index = rebuild_mi_with_names(out_index, self.index.names) return Series(index=out_index, data=self.values[selector]) # ---- Series key: preserve its index, lookup by its values (Arkouda Series) if isinstance(key, Series): return Series(index=key.index, data=lookup(self.index.values, self.values, key.values)) # ---- Direct index objects if isinstance(key, MultiIndex): return finalize(self.index.lookup(key.index)) if isinstance(key, 
Index): return finalize(self.index.lookup(key.values)) # ---- pdarray key if isinstance(key, pdarray): return finalize(self.index.lookup(key)) # ---- list/tuple keys if isinstance(key, (list, tuple)): if isinstance(self.index, MultiIndex): nlevels = self.index.nlevels if len(key) != nlevels: raise TypeError( "For MultiIndex Series, 'key' must be a tuple label, an Index/MultiIndex, " "or per-level keys with length equal to nlevels." ) # Reject flat list-of-scalars like [0, 2] if isinstance(key, list): all_scalar = True for k in key: if not is_scalar_label(k): all_scalar = False break if all_scalar: raise TypeError( "For MultiIndex Series, a single label must be a tuple, e.g. (0, 2), " "not a flat list like [0, 2]." ) # Single scalar label tuple: (0, 10) -> per-level length-1 arrays if isinstance(key, tuple): all_scalar = True for k in key: if not is_scalar_label(k): all_scalar = False break if all_scalar: per_level = [array([k]) for k in key] return finalize(self.index.lookup(per_level)) # Per-level keys: normalize each element without pandas/numpy per_level = [to_pdarray(k) for k in key] # Enforce paired selection: equal sizes (metadata only) sizes = [int(k.size) for k in per_level] if len(set(sizes)) != 1: raise ValueError( f"Per-level MultiIndex keys must have the same length; got {sizes}." ) return finalize(self.index.lookup(per_level)) # Non-MultiIndex: # - list of scalars -> convert to pdarray # - nested list/tuple -> transpose using pure Python (keys only) key0 = key[0] if isinstance(key0, (list, tuple)): cols = list(zip(*key)) per_level = [array(col) for col in cols] # col is a tuple of python scalars return finalize(self.index.lookup(per_level)) if isinstance(key0, pdarray): return finalize(self.index.lookup(key)) return finalize(self.index.lookup(to_pdarray(key))) # ---- scalar key return finalize(self.index == key)
@classmethod def _make_binop(cls, operator): def binop(self, other) -> Series: if isinstance(other, Series): if self.index._check_aligned(other.index): return cls((self.index, operator(self.values, other.values))) else: idx = self.index._merge(other.index).index a = lookup(self.index.values, self.values, idx, fillvalue=0) b = lookup(other.index.values, other.values, idx, fillvalue=0) return cls((idx, operator(a, b))) else: return cls((self.index, operator(self.values, other))) return binop @classmethod def _make_unaryop(cls, operator): def unaryop(self) -> Series: return cls((self.index, operator(self.values))) return unaryop
    def max(self):
        """Return the maximum of the Series values."""
        return self.values.max()

    def min(self):
        """Return the minimum of the Series values."""
        return self.values.min()

    def mean(self):
        """Return the mean of the Series values."""
        return self.values.mean()

    def sum(self):
        """Return the sum of the Series values."""
        return self.values.sum()

    def std(self):
        """Return the standard deviation of the Series values."""
        return self.values.std()

    def var(self):
        """Return the variance of the Series values."""
        return self.values.var()

    def argmax(self):
        """Return the position of the maximum value."""
        return self.values.argmax()

    def argmin(self):
        """Return the position of the minimum value."""
        return self.values.argmin()

    def prod(self):
        """Return the product of the Series values."""
        return self.values.prod()
[docs] @typechecked def add(self, b: Series) -> Series: index = self.index.concat(b.index).values values = concatenate([self.values, b.values], ordered=False) idx, vals = GroupBy(index).sum(values) return Series(data=vals, index=idx)
[docs] @typechecked def topn(self, n: int = 10) -> Series: """ Return the top values of the Series. Parameters ---------- n : int Number of values to return. Defaults to 10. Returns ------- Series A new Series containing the top `n` values. """ k = self.index v = self.values idx = argmaxk(v, n) idx = idx[-1 : -n - 1 : -1] return Series(index=k.values[idx], data=v[idx])
def _reindex(self, idx): if isinstance(self.index, MultiIndex): new_index = MultiIndex(self.index[idx].levels, name=self.index.name, names=self.index.names) elif isinstance(self.index, Index): new_index = Index(self.index[idx], name=self.index.name) else: new_index = Index(self.index[idx]) return Series(index=new_index, data=self.values[idx])
[docs] @typechecked def sort_index(self, ascending: bool = True) -> Series: """ Sort the Series by its index. Parameters ---------- ascending : bool Whether to sort the index in ascending (default) or descending order. Defaults to True. Returns ------- Series A new Series sorted by index. """ idx = self.index.argsort(ascending=ascending) return self._reindex(idx)
[docs] @typechecked def sort_values(self, ascending: bool = True) -> Series: """ Sort the Series by its values. Parameters ---------- ascending : bool Whether to sort values in ascending (default) or descending order. Defaults to True. Returns ------- Series A new Series sorted by its values. """ values_any = cast(Any, self.values) if not ascending: if isinstance(self.values, pdarray) and self.values.dtype in ( int64, float64, ): # For numeric values, negation reverses sort order idx = argsort(-self.values) else: # For non-numeric values, need the descending arange because reverse slicing # is not supported idx = argsort(values_any)[arange(self.values.size - 1, -1, -1)] else: idx = argsort(values_any) return self._reindex(idx)
[docs] @typechecked def tail(self, n: int = 10) -> Series: """Return the last n values of the series.""" idx_series = self.index[-n:] return Series(index=idx_series.values, data=self.values[-n:])
[docs] @typechecked def head(self, n: int = 10) -> Series: """Return the first n values of the series.""" idx_series = self.index[0:n] return Series(index=idx_series.values, data=self.values[0:n])
[docs] @typechecked def to_pandas(self) -> pd.Series: """Convert the series to a local PANDAS series.""" import copy from arkouda.numpy.segarray import SegArray from arkouda.pandas.categorical import Categorical idx = self.index.to_pandas() val: Any if isinstance(self.values, Categorical): val = self.values.to_pandas() elif isinstance(self.values, SegArray): # pandas errors when ndarray formatted like a segarray is # passed into Series but works when it's just a list of lists val = self.values.tolist() else: val = self.values.to_ndarray() if isinstance(self.name, str): name = copy.copy(self.name) return pd.Series(val, index=idx, name=name) else: return pd.Series(val, index=idx)
    def to_markdown(self, mode="wt", index=True, tablefmt="grid", storage_options=None, **kwargs):
        r"""
        Print Series in Markdown-friendly format.

        Parameters
        ----------
        mode : str, optional
            Mode in which file is opened, "wt" by default.
        index : bool, optional, default True
            Add index (row) labels.
        tablefmt: str = "grid"
            Table format to call from tabulate:
            https://pypi.org/project/tabulate/
        storage_options: dict, optional
            Extra options that make sense for a particular storage connection,
            e.g. host, port, username, password, etc., if using a URL that will be
            parsed by fsspec, e.g., starting “s3://”, “gcs://”. An error will be
            raised if providing this argument with a non-fsspec URL. See the
            fsspec and backend storage implementation docs for the set of allowed
            keys and values.
        **kwargs
            These parameters will be passed to tabulate.

        Note
        ----
        This function should only be called on small Series as it calls
        pandas.Series.to_markdown:
        https://pandas.pydata.org/docs/reference/api/pandas.Series.to_markdown.html

        Examples
        --------
        >>> import arkouda as ak
        >>> s = ak.Series(["elk", "pig", "dog", "quetzal"], name="animal")
        >>> print(s.to_markdown())
        +----+----------+
        |    | animal   |
        +====+==========+
        |  0 | elk      |
        +----+----------+
        |  1 | pig      |
        +----+----------+
        |  2 | dog      |
        +----+----------+
        |  3 | quetzal  |
        +----+----------+

        Output markdown with a tabulate option.

        >>> print(s.to_markdown(tablefmt="grid"))
        +----+----------+
        |    | animal   |
        +====+==========+
        |  0 | elk      |
        +----+----------+
        |  1 | pig      |
        +----+----------+
        |  2 | dog      |
        +----+----------+
        |  3 | quetzal  |
        +----+----------+
        """
        # Delegates entirely to pandas after converting locally.
        return self.to_pandas().to_markdown(
            mode=mode,
            index=index,
            tablefmt=tablefmt,
            storage_options=storage_options,
            **kwargs,
        )
[docs] @typechecked() def tolist(self) -> list: p = self.to_pandas() return p.tolist()
[docs] @typechecked def value_counts(self, sort: bool = True) -> Series: """ Return a Series containing counts of unique values. Parameters ---------- sort : bool Whether to sort the result by count in descending order. If False, the order of the results is not guaranteed. Defaults to True. Returns ------- Series A Series where the index contains the unique values and the values are their counts in the original Series. """ from arkouda.numpy import value_counts dtype = get_callback(self.values) idx, vals = value_counts(self.values) s = Series(index=idx, data=vals) if sort: s = s.sort_values(ascending=False) s.index.set_dtype(dtype) return s
[docs] @typechecked def diff(self) -> Series: """ Diffs consecutive values of the series. Returns a new series with the same index and length. First value is set to NaN. """ values = zeros(len(self), "float64") if not isinstance(self.values, Categorical): values[1:] = akcast(self.values[1:] - self.values[:-1], "float64") values[0] = np.nan else: raise TypeError("Diff not supported on Series built from Categorical.") return Series(data=values, index=self.index)
[docs] @typechecked def to_dataframe( self, index_labels: Union[List[builtin_str], None] = None, value_label: Union[builtin_str, None] = None, ) -> arkouda.pandas.dataframe.DataFrame: """ Convert the Series to an Arkouda DataFrame. Parameters ---------- index_labels : list of str or None, optional Column name(s) to label the index. value_label : str or None, optional Column name to label the values. Returns ------- DataFrame An Arkouda DataFrame representing the Series. """ list_value_label = [value_label] if isinstance(value_label, str) else value_label return Series.concat([self], axis=1, index_labels=index_labels, value_labels=list_value_label)
    @typechecked
    def register(self, user_defined_name: builtin_str):
        """
        Register this Series object and underlying components with the Arkouda server.

        Parameters
        ----------
        user_defined_name : builtin_str
            User-defined name the Series is to be registered under.
            This will be the root name for the underlying components.

        Returns
        -------
        Series
            The same Series which is now registered with the arkouda server and
            has an updated name. This is an in-place modification, the original
            is returned to support a fluid programming style. Please note you
            cannot register two different Series with the same name.

        Raises
        ------
        TypeError
            Raised if user_defined_name is not a str
        RegistrationError
            If the server was unable to register the Series with the user_defined_name

        See Also
        --------
        unregister, attach, is_registered

        Notes
        -----
        Objects registered with the server are immune to deletion until they are
        unregistered.
        """
        from arkouda.core.client import generic_msg
        from arkouda.pandas.categorical import Categorical

        if self.registered_name is not None and self.is_registered():
            raise RegistrationError(f"This object is already registered as {self.registered_name}")
        generic_msg(
            cmd="register",
            args={
                "name": user_defined_name,
                "objType": self.objType,
                "num_idxs": 1,
                # A Categorical index is registered as a JSON bundle naming its
                # component pdarrays (optional components only when present);
                # any other index type registers by its server-side name.
                "idx_names": [
                    (
                        json.dumps(
                            {
                                "codes": self.index.values.codes.name,
                                "categories": self.index.values.categories.name,
                                "NA_codes": self.index.values._akNAcode.name,
                                **(
                                    {"permutation": self.index.values.permutation.name}
                                    if self.index.values.permutation is not None
                                    else {}
                                ),
                                **(
                                    {"segments": self.index.values.segments.name}
                                    if self.index.values.segments is not None
                                    else {}
                                ),
                            }
                        )
                        if isinstance(self.index.values, Categorical)
                        else self.index.values.name
                    )
                ],
                "idx_types": [self.index.values.objType],
                # Same Categorical-vs-name treatment for the values component.
                "values": (
                    json.dumps(
                        {
                            "codes": self.values.codes.name,
                            "categories": self.values.categories.name,
                            "NA_codes": self.values._akNAcode.name,
                            **(
                                {"permutation": self.values.permutation.name}
                                if self.values.permutation is not None
                                else {}
                            ),
                            **(
                                {"segments": self.values.segments.name}
                                if self.values.segments is not None
                                else {}
                            ),
                        }
                    )
                    if isinstance(self.values, Categorical)
                    else cast(Any, self.values).name
                ),
                "val_type": self.values.objType,
            },
        )
        self.registered_name = user_defined_name
        return self
[docs] def unregister(self): """ Unregister this Series object in the arkouda server which was previously registered using register() and/or attached to using attach(). Raises ------ RegistrationError If the object is already unregistered or if there is a server error when attempting to unregister See Also -------- register, attach, is_registered Notes ----- Objects registered with the server are immune to deletion until they are unregistered. """ from arkouda.numpy.util import unregister if not self.registered_name: raise RegistrationError("This object is not registered") unregister(self.registered_name) self.registered_name = None
[docs] @typechecked def is_registered(self) -> bool: """ Return True iff the object is contained in the registry or is a component of a registered object. Returns ------- bool Indicates if the object is contained in the registry Raises ------ RegistrationError Raised if there's a server-side error or a mis-match of registered components See Also -------- register, attach, unregister Notes ----- Objects registered with the server are immune to deletion until they are unregistered. """ from arkouda.numpy.util import is_registered if self.registered_name is None: return False else: return is_registered(self.registered_name)
[docs] @classmethod @typechecked def from_return_msg(cls, rep_msg: builtin_str) -> Series: """ Return a Series instance pointing to components created by the arkouda server. The user should not call this function directly. Parameters ---------- rep_msg : builtin_str + delimited string containing the values and indexes. Returns ------- Series A Series representing a set of pdarray components on the server. Raises ------ RuntimeError Raised if a server-side error is thrown in the process of creating the Series instance. """ from arkouda.numpy.strings import Strings from arkouda.pandas.categorical import Categorical values: Union[pdarray, Strings, Categorical] data = json.loads(rep_msg) val_comps = data["value"].split("+|+") if val_comps[0] == Categorical.objType.upper(): values = Categorical.from_return_msg(val_comps[1]) elif val_comps[0] == Strings.objType.upper(): values = Strings.from_return_msg(val_comps[1]) else: values = create_pdarray(val_comps[1]) index = Index.from_return_msg(data["index"]) return cls(values, index)
@staticmethod
@typechecked
def _all_aligned(array: List) -> bool:
    """Return whether all Series in the array are index-aligned."""
    series_iter = iter(array)
    # Compare every remaining index against the first one.
    reference = next(series_iter).index
    return all(reference._check_aligned(s.index) is not False for s in series_iter)
@staticmethod
@typechecked
def concat(
    arrays: List,
    axis: int = 0,
    index_labels: Union[List[builtin_str], None] = None,
    value_labels: Union[List[builtin_str], None] = None,
    ordered: bool = False,
) -> Union[arkouda.pandas.dataframe.DataFrame, Series]:
    """
    Concatenate a list of Arkouda Series or grouped arrays horizontally or vertically.

    If a list of grouped Arkouda arrays is passed, they are converted to Series.
    Each grouping is a 2-tuple where the first item is the key(s) and the second
    is the value. If concatenating horizontally (axis=1), all series/groupings
    must have the same length and the same index. The index is converted to a
    column in the resulting DataFrame; if it's a MultiIndex, each level is
    converted to a separate column.

    Parameters
    ----------
    arrays : List
        A list of Series or groupings (tuples of index and values) to concatenate.
    axis : int
        The axis to concatenate along:
        - 0 = vertical (stack series into one)
        - 1 = horizontal (align by index and produce a DataFrame)
        Defaults to 0.
    index_labels : List[str] or None, optional
        Column name(s) to label the index when axis=1.
    value_labels : List[str] or None, optional
        Column names to label the values of each Series.
    ordered : bool
        Unused parameter. Reserved for future support of deterministic vs.
        performance-optimized concatenation. Defaults to False.

    Returns
    -------
    Series or DataFrame
        - If axis=0: a new Series
        - If axis=1: a new DataFrame
    """
    from arkouda.numpy.alignment import lookup

    if len(arrays) == 0:
        raise IndexError("Array length must be non-zero")

    # All inputs must be homogeneous (all Series, or all groupings).
    types = {type(x) for x in arrays}
    if len(types) != 1:
        raise TypeError(f"Items must all have same type: {types}")

    # Groupings (key, value) tuples are promoted to Series first.
    if isinstance(arrays[0], tuple):
        arrays = [Series(i) for i in arrays]

    if axis == 1:
        # Horizontal concat
        if value_labels is None:
            value_labels = [f"val_{i}" for i in range(len(arrays))]

        if Series._all_aligned(arrays):
            # Fast path: all indices already match, so columns can be taken as-is.
            data = next(iter(arrays)).index.to_dict(index_labels)

            if value_labels is not None:
                # Expect value_labels to always be not None; we're doing the check for mypy
                for col, label in zip(arrays, value_labels):
                    data[str(label)] = col.values
        else:
            # Slow path: merge all indices, then look up each Series' values
            # against the merged index (missing positions filled with 0).
            aitor = iter(arrays)
            idx = next(aitor).index
            idx = idx._merge_all([i.index for i in aitor])

            data = idx.to_dict(index_labels)

            if value_labels is not None:
                # Expect value_labels to always be not None; we're doing the check for mypy
                for col, label in zip(arrays, value_labels):
                    data[str(label)] = lookup(col.index.values, col.values, idx.values, fillvalue=0)

        return arkouda.pandas.dataframe.DataFrame(data)
    else:
        # Vertical concat: stack indices and values pairwise in input order.
        idx = arrays[0].index
        v = arrays[0].values

        for other in arrays[1:]:
            idx = idx.concat(other.index)
            v = concatenate([v, other.values], ordered=True)

        return Series(index=idx.values, data=v)
def map(self, arg: Union[dict, Series]) -> Series:
    """
    Map values of Series according to an input mapping.

    Parameters
    ----------
    arg : dict or Series
        The mapping correspondence.

    Returns
    -------
    Series
        A new series with the same index as the caller. When the input
        Series has Categorical values, the return Series will have Strings
        values. Otherwise, the return type will match the input type.

    Raises
    ------
    TypeError
        Raised if arg is not of type dict or arkouda.Series.
        Raised if series values not of type pdarray, Categorical, or Strings.

    Examples
    --------
    >>> import arkouda as ak
    >>> s = ak.Series(ak.array([2, 3, 2, 3, 4]))
    >>> s
    0    2
    1    3
    2    2
    3    3
    4    4
    dtype: int64

    >>> s.map({4: 25.0, 2: 30.0, 1: 7.0, 3: 5.0})
    0    30.0
    1     5.0
    2    30.0
    3     5.0
    4    25.0
    dtype: float64

    >>> s2 = ak.Series(ak.array(["a","b","c","d"]), index = ak.array([4,2,1,3]))
    >>> s.map(s2)
    0    b
    1    d
    2    b
    3    d
    4    a
    dtype: ...
    """
    from arkouda import Series
    from arkouda.numpy.strings import Strings
    from arkouda.numpy.util import map as ak_map
    from arkouda.pandas.categorical import Categorical

    mappable_types = (pdarray, Strings, Categorical)
    if not isinstance(self.values, mappable_types):
        raise TypeError("Series values must be of type pdarray, Categorical, or Strings to use map")
    return Series(ak_map(self.values, arg), index=self.index)
def isna(self) -> Series:
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as numpy.NaN, get mapped to True. Everything else gets
    mapped to False. Characters such as empty strings '' are not considered
    NA values.

    Returns
    -------
    Series
        Mask of bool values for each element in Series that indicates
        whether an element is an NA value.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import Series
    >>> import numpy as np
    >>> s = Series(ak.array([1, 2, np.nan]), index = ak.array([1, 2, 4]))
    >>> s.isna()
    1    False
    2    False
    4     True
    dtype: bool
    """
    from arkouda.numpy import isnan
    from arkouda.numpy.pdarraycreation import full
    from arkouda.numpy.segarray import SegArray

    vals = self.values
    if isinstance(vals, SegArray):
        raise TypeError("isna is not supported for SegArray-backed Series")

    if is_float(vals):
        mask = isnan(vals)
    else:
        # Only float arrays can hold NaN, so everything else is all-False.
        mask = full(vals.size, False, dtype=bool)
    return Series(mask, index=self.index)
[docs] def isnull(self) -> Series: """ Series.isnull is an alias for Series.isna. Detect missing values. Return a boolean same-sized object indicating if the values are NA. NA values, such as numpy.NaN, gets mapped to True values. Everything else gets mapped to False values. Characters such as empty strings '' are not considered NA values. Returns ------- Series Mask of bool values for each element in Series that indicates whether an element is an NA value. Examples -------- >>> import arkouda as ak >>> from arkouda import Series >>> import numpy as np >>> s = Series(ak.array([1, 2, np.nan]), index = ak.array([1, 2, 4])) >>> s.isnull() 1 False 2 False 4 True dtype: bool """ return self.isna()
def notna(self) -> Series:
    """
    Detect existing (non-missing) values.

    Return a boolean same-sized object indicating if the values are not NA.
    Non-missing values get mapped to True. Characters such as empty strings
    '' are not considered NA values. NA values, such as numpy.NaN, get
    mapped to False values.

    Returns
    -------
    Series
        Mask of bool values for each element in Series that indicates
        whether an element is not an NA value.

    Raises
    ------
    TypeError
        Raised if the Series is backed by a SegArray.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import Series
    >>> import numpy as np
    >>> s = Series(ak.array([1, 2, np.nan]), index = ak.array([1, 2, 4]))
    >>> s.notna()
    1     True
    2     True
    4    False
    dtype: bool
    """
    from arkouda.numpy import isnan
    from arkouda.numpy.pdarraycreation import full
    from arkouda.numpy.segarray import SegArray

    if isinstance(self.values, SegArray):
        # Fixed: the message previously referenced "isna" in this method.
        raise TypeError("notna is not supported for SegArray-backed Series")
    if not is_float(self.values):
        # Only float arrays can hold NaN, so everything else is all-True.
        return Series(full(self.values.size, True, dtype=bool), index=self.index)
    return Series(~isnan(self.values), index=self.index)
[docs] def notnull(self) -> Series: """ Series.notnull is an alias for Series.notna. Detect existing (non-missing) values. Return a boolean same-sized object indicating if the values are not NA. Non-missing values get mapped to True. Characters such as empty strings '' are not considered NA values. NA values, such as numpy.NaN, get mapped to False values. Returns ------- Series Mask of bool values for each element in Series that indicates whether an element is not an NA value. Examples -------- >>> import arkouda as ak >>> from arkouda import Series >>> import numpy as np >>> s = Series(ak.array([1, 2, np.nan]), index = ak.array([1, 2, 4])) >>> s.notnull() 1 True 2 True 4 False dtype: bool """ return self.notna()
def hasnans(self) -> bool_scalars:
    """
    Return True if there are any NaNs.

    Returns
    -------
    bool
        True if any value in the Series is NaN; False otherwise (including
        for non-float Series, which cannot hold NaN).

    Raises
    ------
    TypeError
        Raised if the Series is backed by a SegArray.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import Series
    >>> import numpy as np
    >>> s = ak.Series(ak.array([1, 2, 3, np.nan]))
    >>> s
    0    1.0
    1    2.0
    2    3.0
    3    NaN
    dtype: float64

    >>> s.hasnans()
    np.True_
    """
    from arkouda.numpy import isnan
    from arkouda.numpy.segarray import SegArray

    if isinstance(self.values, SegArray):
        # Fixed: the message previously referenced "isna" in this method.
        raise TypeError("hasnans is not supported for SegArray-backed Series")
    if is_float(self.values):
        result = any(isnan(self.values))
        # ak.any should yield a bool scalar for a 1-D input; guard anyway so a
        # non-scalar result cannot leak out of this predicate.
        if isinstance(result, (bool, np.bool_)):
            return result
    # Non-float Series cannot contain NaN; always return an explicit False
    # (never an implicit None) to honor the bool_scalars return annotation.
    return False
def fillna(self, value: Union[supported_scalars, Series, pdarray]) -> Series:
    """
    Fill NA/NaN values using the specified method.

    Parameters
    ----------
    value : supported_scalars, Series, or pdarray
        Value to use to fill holes (e.g. 0), alternately a Series of values
        specifying which value to use for each index. Values not in the
        Series will not be filled. This value cannot be a list.

    Returns
    -------
    Series
        Object with missing values filled. For non-float Series (which
        cannot hold NaN), a copy with unchanged values is returned.

    Raises
    ------
    TypeError
        Raised if the Series is float-valued but the fill value is a
        Strings, Categorical, or SegArray.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import Series
    >>> import numpy as np
    >>> data = ak.Series([1, np.nan, 3, np.nan, 5])
    >>> data
    0    1.0
    1    NaN
    2    3.0
    3    NaN
    4    5.0
    dtype: float64

    >>> fill_values1 = ak.ones(5)
    >>> data.fillna(fill_values1)
    0    1.0
    1    1.0
    2    3.0
    3    1.0
    4    5.0
    dtype: float64

    >>> fill_values2 = Series(ak.ones(5))
    >>> data.fillna(fill_values2)
    0    1.0
    1    1.0
    2    3.0
    3    1.0
    4    5.0
    dtype: float64

    >>> fill_values3 = 100.0
    >>> data.fillna(fill_values3)
    0      1.0
    1    100.0
    2      3.0
    3    100.0
    4      5.0
    dtype: float64
    """
    import typing as t

    from arkouda.numpy import isnan, where
    from arkouda.numpy.segarray import SegArray
    from arkouda.numpy.strings import Strings
    from arkouda.pandas.categorical import Categorical

    # Normalize `value` to the underlying thing
    value_: Union[supported_scalars, pdarray, Strings, Categorical, SegArray]
    if isinstance(value, Series):
        value_ = value.values
    else:
        value_ = value  # scalar or pdarray

    # Only float pdarray supports NaN fill
    if isinstance(self.values, pdarray) and is_float(self.values):
        # For a float Series, the fill value must be numeric scalar or pdarray
        if isinstance(value_, (Strings, Categorical, SegArray)):
            raise TypeError("fillna for float Series requires a numeric scalar or pdarray")

        # Narrow the static type for mypy; where() replaces NaN positions
        # element-wise with the fill value.
        value_num = t.cast(Union[supported_scalars, pdarray], value_)
        return Series(where(isnan(self.values), value_num, self.values), index=self.index)

    # Non-float: current behavior is "no-op"
    return Series(self.values, index=self.index)
@staticmethod
@typechecked
def pdconcat(
    arrays: List, axis: int = 0, labels: Union[Strings, None] = None
) -> Union[pd.Series, pd.DataFrame]:
    """
    Concatenate a list of Arkouda Series or grouped arrays, returning a local pandas object.

    If a list of grouped Arkouda arrays is passed, they are converted to Series.
    Each grouping is a 2-tuple with the first item being the key(s) and the
    second the value.

    If `axis=1` (horizontal), each Series or grouping must have the same length
    and the same index. The index is converted to a column in the resulting
    DataFrame. If it is a MultiIndex, each level is converted to a separate column.

    Parameters
    ----------
    arrays : List
        A list of Series or groupings (tuples of index and values) to concatenate.
    axis : int
        The axis along which to concatenate:
        - 0 = vertical (stack into a Series)
        - 1 = horizontal (align by index into a DataFrame)
        Defaults to 0.
    labels : Strings or None, optional
        Names to assign to the resulting columns in the DataFrame.

    Returns
    -------
    Series or DataFrame
        - If axis=0: a local pandas Series
        - If axis=1: a local pandas DataFrame
    """
    if len(arrays) == 0:
        raise IndexError("Array length must be non-zero")

    # All inputs must be the same kind (all Series, or all groupings).
    types = {type(x) for x in arrays}
    if len(types) != 1:
        raise TypeError(f"Items must all have same type: {types}")

    if isinstance(arrays[0], tuple):
        arrays = [Series(i) for i in arrays]

    if axis != 1:
        # Vertical: localize each Series and stack them.
        return pd.concat([s.to_pandas() for s in arrays])

    # Horizontal: every column shares the first Series' (localized) index.
    shared_index = arrays[0].index.to_pandas()
    columns = [pd.Series(data=s.values.to_ndarray(), index=shared_index) for s in arrays]
    result = pd.concat(columns, axis=1)
    if labels is not None:
        result.columns = pd.Index(labels)
    return result
class _LocIndexer:
    # Label-based indexer: thin pass-through to the Series' own
    # __getitem__/__setitem__ (which interpret keys as index labels).

    def __init__(self, series):
        # series: the Series this indexer operates on.
        self.series = series

    def __getitem__(self, key):
        return self.series[key]

    def __setitem__(self, key, val):
        self.series[key] = val


class _iLocIndexer:
    # Position-based indexer: keys are integer positions (or boolean masks /
    # slices over positions), validated before being applied to the backing
    # values array.

    def __init__(self, method_name, series):
        # method_name: used only in error messages (e.g. "iloc").
        self.name = method_name
        self.series = series

    def validate_key(self, key) -> Union[pdarray, int]:
        # Normalize and bounds-check a positional key. Lists become pdarrays,
        # slices become arange pdarrays; ints pass through. Raises on tuples,
        # out-of-bounds positions, wrong-length boolean masks, and non-integer
        # key types.
        from arkouda.numpy.pdarraycreation import arange, array

        if isinstance(key, list):
            key = array(key)

        if isinstance(key, tuple):
            raise TypeError(".{} does not support tuple arguments".format(self.name))

        if isinstance(key, pdarray):
            if len(key) == 0:
                raise ValueError("Cannot index using 0-length iterables.")
            if key.dtype != int64 and key.dtype != bool:
                raise TypeError(".{} requires integer keys".format(self.name))
            # Boolean masks must cover the whole Series.
            if key.dtype == "bool_" and key.size != self.series.size:
                raise IndexError(
                    "Boolean index has wrong length: {} instead of {}".format(key.size, self.series.size)
                )
            elif any(key >= self.series.size):
                # Integer positions must all be in range; iloc never grows the target.
                raise IndexError("{} cannot enlarge its target object.".format(self.name))
        elif isinstance(key, int):
            if key >= self.series.size:
                raise IndexError("{} cannot enlarge its target object.".format(self.name))
        elif isinstance(key, slice):
            # Materialize the slice as an explicit position array, defaulting
            # missing bounds to [0, size) with stride 1.
            start = key.start if key.start is not None else 0
            stop = key.stop if key.stop is not None else self.series.size
            stride = key.step if key.step is not None else 1
            if start < 0:
                raise IndexError("Slice start must be non-negative")
            if stop > self.series.size:
                raise IndexError("Slice stop must be less than or equal to the size of the Series")
            if start > stop:
                # NOTE(review): this also rejects reversed slices with a negative
                # step (e.g. [5:1:-1]) — confirm whether that is intentional.
                raise IndexError("Slice start must be less than or equal to the stop")
            key = arange(start, stop, stride)
        else:
            raise TypeError(".{} requires integer keys".format(self.name))
        return key

    def validate_val(self, val) -> Union[pdarray, supported_scalars]:
        # Value validation is delegated to the Series itself.
        return self.series.validate_val(val)

    def __getitem__(self, key):
        from arkouda.numpy.pdarraycreation import array

        key = self.validate_key(key)
        if is_supported_scalar(key):
            # Wrap a single position so the result is still a (1-element) Series.
            key = array([key])
        return Series(index=self.series.index[key], data=self.series.values[key])

    def __setitem__(self, key, val):
        key = self.validate_key(key)
        val = self.validate_val(val)
        if is_supported_scalar(val):
            # A scalar value broadcasts over any validated key.
            self.series.values[key] = val
            return
        else:
            if is_supported_scalar(key):
                self.series.values[key] = val
                return
            # Array value against an integer-position key: lengths must match.
            # (Boolean-mask keys are passed straight through to pdarray indexing.)
            if key.dtype == int64 and len(val) != len(key):
                raise ValueError(
                    "cannot set using a list-like indexer with a different length than the value"
                )
            self.series.values[key] = val