Source code for arkouda.array_view

from __future__ import annotations

import json
from enum import Enum

import numpy as np  # type: ignore

from arkouda.client import generic_msg
from arkouda.dtypes import resolve_scalar_dtype, translate_np_dtype
from arkouda.numeric import cast as akcast
from arkouda.numeric import cumprod, where
from arkouda.pdarrayclass import create_pdarray, parse_single_value, pdarray
from arkouda.pdarraycreation import arange, array, ones, zeros
from arkouda.pdarraysetops import concatenate

__all__ = ["ArrayView"]

OrderType = Enum("OrderType", ["ROW_MAJOR", "COLUMN_MAJOR"])


[docs] class ArrayView: """ A multi-dimensional view of a pdarray. Arkouda ``ArraryView`` behaves similarly to numpy's ndarray. The base pdarray is stored in 1-dimension but can be indexed and treated logically as if it were multi-dimensional Attributes ---------- base: pdarray The base pdarray that is being viewed as a multi-dimensional object dtype: dtype The element type of the base pdarray (equivalent to base.dtype) size: int_scalars The number of elements in the base pdarray (equivalent to base.size) shape: pdarray[int] A pdarray specifying the sizes of each dimension of the array ndim: int_scalars Number of dimensions (equivalent to shape.size) itemsize: int_scalars The size in bytes of each element (equivalent to base.itemsize) order: str {'C'/'row_major' | 'F'/'column_major'} Index order to read and write the elements. By default or if 'C'/'row_major', read and write data in row_major order If 'F'/'column_major', read and write data in column_major order """ objType = "ArrayView" def __init__(self, base: pdarray, shape, order="row_major"): self.shape = array(shape) if not isinstance(self.shape, pdarray): raise TypeError(f"ArrayView Shape cannot be type {type(self.shape)}. Expecting pdarray.") if base.size != self.shape.prod(): raise ValueError(f"cannot reshape array of size {base.size} into shape {self.shape}") self.base = base self.size = base.size self.dtype = base.dtype self.ndim = self.shape.size self.itemsize = self.base.itemsize if order.upper() in {"C", "ROW_MAJOR"}: self.order = OrderType.ROW_MAJOR elif order.upper() in {"F", "COLUMN_MAJOR"}: self.order = OrderType.COLUMN_MAJOR else: raise ValueError(f"cannot traverse with order={order}") # cache _reverse_shape which is reversed if we're row_major self._reverse_shape = self.shape if self.order is OrderType.COLUMN_MAJOR else self.shape[::-1] if self.shape.min() == 0: # avoid divide by 0 if any of the dimensions are 0 self._dim_prod = zeros(self.shape.size, self.dtype) else: # cache dim_prod to avoid recalculation, reverse if row_major self._dim_prod = ( cumprod(self.shape) // self.shape if self.order is OrderType.COLUMN_MAJOR else cumprod(self._reverse_shape) // self._reverse_shape ) def __len__(self): return self.size def __repr__(self): from arkouda.client import pdarrayIterThresh if self.size <= pdarrayIterThresh: return self.to_ndarray().__repr__() else: edge_items = np.get_printoptions()["edgeitems"] vals = [f"'{self.base[i]}'" for i in range(edge_items)] vals.append("... ") vals.extend([f"'{self.base[i]}'" for i in range(self.size - edge_items, self.size)]) return f"array([{', '.join(vals)}]), shape {self.shape}" def __str__(self): from arkouda.client import pdarrayIterThresh if self.size <= pdarrayIterThresh: return self.to_ndarray().__str__() else: edge_items = np.get_printoptions()["edgeitems"] vals = [f"'{self.base[i]}'" for i in range(edge_items)] vals.append("... ") vals.extend([f"'{self.base[i]}'" for i in range(self.size - edge_items, self.size)]) return f"[{', '.join(vals)}], shape {self.shape}" def __getitem__(self, key): if isinstance(key, int) or isinstance(key, slice): key = [key] elif isinstance(key, tuple): key = list(key) if len(key) > self.ndim: raise IndexError( f"too many indices for array: array is {self.ndim}-dimensional, " f"but {len(key)} were indexed" ) if len(key) < self.ndim: # append self.ndim-len(key) many ':'s to fill in the missing dimensions for i in range(self.ndim - len(key)): key.append(slice(None, None, None)) try: # attempt to convert to a pdarray (allows for view[0,2,1] instead of view[ak.array([0,2,1])] # but pass on RuntimeError to allow things like # view[0,:,[True,False,True]] to be correctly handled key = array(key) except (RuntimeError, TypeError, ValueError, DeprecationWarning): pass if isinstance(key, pdarray): kind, _ = translate_np_dtype(key.dtype) if kind not in ("int", "uint", "bool"): raise TypeError(f"unsupported pdarray index type {key.dtype}") if kind == "bool": if key.all(): # every dimension is True, so return this arrayview with shape = [1, self.shape] return self.base.reshape( concatenate([ones(1, dtype=self.dtype), self.shape]), order=self.order.name ) else: # at least one dimension is False, # so return empty arrayview with shape = [0, self.shape] return array([], dtype=self.dtype).reshape( concatenate([zeros(1, dtype=self.dtype), self.shape]), order=self.order.name ) # Interpret negative key as offset from end of array key = where(key < 0, akcast(key + self.shape, kind), key) # Capture the indices which are still out of bounds out_of_bounds = (key < 0) | (self.shape <= key) if out_of_bounds.any(): out = arange(key.size)[out_of_bounds][0] raise IndexError( f"index {key[out]} is out of bounds for axis {out} with size {self.shape[out]}" ) coords = key if self.order is OrderType.COLUMN_MAJOR else key[::-1] repMsg = generic_msg( cmd="arrayViewIntIndex", args={ "base": self.base, "dim_prod": self._dim_prod, "coords": coords, }, ) fields = repMsg.split() return parse_single_value(" ".join(fields[1:])) elif isinstance(key, list): indices = [] reshape_dim_list = [] index_dim_list = [] key = key if self.order is OrderType.COLUMN_MAJOR else key[::-1] for i in range(len(key)): x = key[i] if np.isscalar(x) and (resolve_scalar_dtype(x) in ["int64", "uint64"]): orig_key = x if x < 0: # Interpret negative key as offset from end of array x += self._reverse_shape[i] if 0 <= x < self._reverse_shape[i]: indices.append("int") # have to cast to int because JSON doesn't recognize numpy dtypes indices.append(json.dumps(int(x))) index_dim_list.append(1) else: raise IndexError( f"index {orig_key} is out of bounds for axis {i} " f"with size {self._reverse_shape[i]}" ) elif isinstance(x, slice): (start, stop, stride) = x.indices(self._reverse_shape[i]) indices.append("slice") indices.append(json.dumps((start, stop, stride))) slice_size = len(range(*(start, stop, stride))) index_dim_list.append(slice_size) reshape_dim_list.append(slice_size) elif isinstance(x, pdarray) or isinstance(x, list): raise TypeError(f"Advanced indexing is not yet supported {x} ({type(x)})") # x = array(x) # kind, _ = translate_np_dtype(x.dtype) # if kind not in ("bool", "int"): # raise TypeError("unsupported pdarray index type {}".format(x.dtype)) # if kind == "bool" and dim != x.size: # raise ValueError("size mismatch {} {}".format(dim, x.size)) # indices.append('pdarray') # indices.append(x.name) # index_dim_list.append(x.size) # reshape_dim_list.append(x.size) # arrays.append(x) else: raise TypeError(f"Unhandled key type: {x} ({type(x)})") index_dim = array(index_dim_list) repMsg = generic_msg( cmd="arrayViewMixedIndex", args={ "base": self.base, "index_dim": index_dim, "ndim": self.ndim, "dim_prod": self._dim_prod, "coords": indices, }, ) reshape_dim = ( reshape_dim_list if self.order is OrderType.COLUMN_MAJOR else reshape_dim_list[::-1] ) return create_pdarray(repMsg).reshape(reshape_dim, order=self.order.name) else: raise TypeError(f"Unhandled key type: {key} ({type(key)})") def __setitem__(self, key, value): if isinstance(key, int) or isinstance(key, slice): key = [key] elif isinstance(key, tuple): key = list(key) if len(key) > self.ndim: raise IndexError( f"too many indices for array: array is {self.ndim}-dimensional, " f"but {len(key)} were indexed" ) if len(key) < self.ndim: # append self.ndim-len(key) many ':'s to fill in the missing dimensions for i in range(self.ndim - len(key)): key.append(slice(None, None, None)) try: # attempt to convert to a pdarray (allows for view[0,2,1] instead of view[ak.array([0,2,1])] # but pass on RuntimeError to allow things like # view[0,:,[True,False,True]] to be correctly handled key = array(key) except (RuntimeError, TypeError, ValueError, DeprecationWarning): pass if isinstance(key, pdarray): kind, _ = translate_np_dtype(key.dtype) if kind not in ("int", "uint", "bool"): raise TypeError(f"unsupported pdarray index type {key.dtype}") if kind == "bool": if key.all(): # every dimension is True, so fill arrayview with value # if any dimension is False, we don't update anything self.base.fill(value) else: # Interpret negative key as offset from end of array key = where(key < 0, akcast(key + self.shape, kind), key) # Capture the indices which are still out of bounds out_of_bounds = (key < 0) | (self.shape <= key) if out_of_bounds.any(): out = arange(key.size)[out_of_bounds][0] raise IndexError( f"index {key[out]} is out of bounds for axis {out} with size {self.shape[out]}" ) coords = key if self.order is OrderType.COLUMN_MAJOR else key[::-1] generic_msg( cmd="arrayViewIntIndexAssign", args={ "base": self.base, "dtype": self.dtype, "dim_prod": self._dim_prod, "coords": coords, "value": self.base.format_other(value), }, ) elif isinstance(key, list): raise NotImplementedError("Setting via slicing and advanced indexing is not yet supported") else: raise TypeError(f"Unhandled key type: {key} ({type(key)})")
[docs] def to_ndarray(self) -> np.ndarray: """ Convert the ArrayView to a np.ndarray, transferring array data from the Arkouda server to client-side Python. Note: if the ArrayView size exceeds client.maxTransferBytes, a RuntimeError is raised. Returns ------- np.ndarray A numpy ndarray with the same attributes and data as the ArrayView Raises ------ RuntimeError Raised if there is a server-side error thrown, if the ArrayView size exceeds the built-in client.maxTransferBytes size limit, or if the bytes received does not match expected number of bytes Notes ----- The number of bytes in the array cannot exceed ``client.maxTransferBytes``, otherwise a ``RuntimeError`` will be raised. This is to protect the user from overflowing the memory of the system on which the Python client is running, under the assumption that the server is running on a distributed system with much more memory than the client. The user may override this limit by setting client.maxTransferBytes to a larger value, but proceed with caution. See Also -------- array() to_list() Examples -------- >>> a = ak.arange(6).reshape(2,3) >>> a.to_ndarray() array([[0, 1, 2], [3, 4, 5]]) >>> type(a.to_ndarray()) numpy.ndarray """ if self.order is OrderType.ROW_MAJOR: return self.base.to_ndarray().reshape(self.shape.to_ndarray()) else: return self.base.to_ndarray().reshape(self.shape.to_ndarray(), order="F")
[docs] def to_list(self) -> list: """ Convert the ArrayView to a list, transferring array data from the Arkouda server to client-side Python. Note: if the ArrayView size exceeds client.maxTransferBytes, a RuntimeError is raised. Returns ------- list A list with the same data as the ArrayView Raises ------ RuntimeError Raised if there is a server-side error thrown, if the ArrayView size exceeds the built-in client.maxTransferBytes size limit, or if the bytes received does not match expected number of bytes Notes ----- The number of bytes in the array cannot exceed ``client.maxTransferBytes``, otherwise a ``RuntimeError`` will be raised. This is to protect the user from overflowing the memory of the system on which the Python client is running, under the assumption that the server is running on a distributed system with much more memory than the client. The user may override this limit by setting client.maxTransferBytes to a larger value, but proceed with caution. See Also -------- to_ndarray() Examples -------- >>> a = ak.arange(6).reshape(2,3) >>> a.to_list() [[0, 1, 2], [3, 4, 5]] >>> type(a.to_list()) list """ return self.to_ndarray().tolist()
[docs] def to_hdf( self, prefix_path: str, dataset: str = "ArrayView", mode: str = "truncate", file_type: str = "distribute", ): """ Save the current ArrayView object to hdf5 file Parameters ---------- prefix_path: str Path to the file to write the dataset to dataset: str Name of the dataset to write mode: str (truncate | append) Default: truncate Mode to write the dataset in. Truncate will overwrite any existing files. Append will add the dataset to an existing file. file_type: str (single|distribute) Default: distribute Indicates the format to save the file. Single will store in a single file. Distribute will store the date in a file per locale. """ from arkouda.io import _file_type_to_int, _mode_str_to_int generic_msg( cmd="tohdf", args={ "values": self.base, "shape": self.shape, "order": self.order, "filename": prefix_path, "file_format": _file_type_to_int(file_type), "dset": dataset, "write_mode": _mode_str_to_int(mode), "objType": "ArrayView", }, )
[docs] def update_hdf( self, prefix_path: str, dataset: str = "ArrayView", repack: bool = True, ): """ Overwrite the dataset with the name provided with this array view object. If the dataset does not exist it is added. Parameters ----------- prefix_path : str Directory and filename prefix that all output files share dataset : str Name of the dataset to create in files repack: bool Default: True HDF5 does not release memory on delete. When True, the inaccessible data (that was overwritten) is removed. When False, the data remains, but is inaccessible. Setting to false will yield better performance, but will cause file sizes to expand. Returns -------- str - success message if successful Raises ------- RuntimeError Raised if a server-side error is thrown saving the array view Notes ------ - If file does not contain File_Format attribute to indicate how it was saved, the file name is checked for _LOCALE#### to determine if it is distributed. - If the dataset provided does not exist, it will be added - Because HDF5 deletes do not release memory, this will create a copy of the file with the new data """ from arkouda.io import ( _file_type_to_int, _get_hdf_filetype, _mode_str_to_int, _repack_hdf, ) # determine the format (single/distribute) that the file was saved in file_type = _get_hdf_filetype(prefix_path + "*") generic_msg( cmd="tohdf", args={ "values": self.base, "shape": self.shape, "order": self.order, "filename": prefix_path, "file_format": _file_type_to_int(file_type), "dset": dataset, "write_mode": _mode_str_to_int("append"), "objType": "ArrayView", "overwrite": True, }, ) if repack: _repack_hdf(prefix_path)