Source code for arkouda.dataframe

from __future__ import annotations

import json
import os
import random
from collections import UserDict
from typing import Callable, Dict, List, Optional, Tuple, Union, cast
from warnings import warn

import numpy as np  # type: ignore
import pandas as pd  # type: ignore
from numpy import ndarray
from numpy._typing import _8Bit, _16Bit, _32Bit, _64Bit
from typeguard import typechecked

from arkouda import sort as aksort
from arkouda.categorical import Categorical
from arkouda.client import generic_msg, maxTransferBytes
from arkouda.client_dtypes import BitVector, Fields, IPv4
from arkouda.numpy.dtypes import bigint
from arkouda.numpy.dtypes import bool_ as akbool
from arkouda.numpy.dtypes import float64 as akfloat64
from arkouda.numpy.dtypes import int64 as akint64
from arkouda.numpy.dtypes import uint64 as akuint64
from arkouda.groupbyclass import GROUPBY_REDUCTION_TYPES
from arkouda.groupbyclass import GroupBy as akGroupBy
from arkouda.groupbyclass import unique
from arkouda.index import Index, MultiIndex
from arkouda.join import inner_join
from arkouda.numeric import cast as akcast
from arkouda.numeric import cumsum, where
from arkouda.pdarrayclass import RegistrationError, pdarray
from arkouda.pdarraycreation import arange, array, create_pdarray, full, zeros
from arkouda.pdarraysetops import concatenate, in1d, intersect1d
from arkouda.row import Row
from arkouda.segarray import SegArray
from arkouda.series import Series
from arkouda.sorting import argsort, coargsort
from arkouda.strings import Strings
from arkouda.timeclass import Datetime, Timedelta

# Importing this module changes a global pandas display option.
# This is necessary for displaying DataFrames with BitVector columns,
# because the pandas HTML repr otherwise truncates wide cell values and
# would cut off part of the displayed bits.
pd.set_option("display.max_colwidth", 65)

# Public names exported by ``from arkouda.dataframe import *``.
__all__ = [
    "DataFrame",
    "DataFrameGroupBy",
    "DiffAggregate",
    "intersect",
    "invert_permutation",
    "intx",
    "merge",
]


def apply_if_callable(maybe_callable, obj, **kwargs):
    """
    Resolve an argument that may be given either as a value or as a callable.

    Parameters
    ----------
    maybe_callable : object
        A plain value, or a callable accepting ``obj`` and ``**kwargs``.
    obj : object
        Passed as the first positional argument when ``maybe_callable`` is
        invoked.
    **kwargs
        Forwarded to ``maybe_callable`` when it is invoked.

    Returns
    -------
    object
        ``maybe_callable(obj, **kwargs)`` when ``maybe_callable`` is callable,
        otherwise ``maybe_callable`` itself, unchanged.
    """
    # Guard clause: plain values pass straight through.
    if not callable(maybe_callable):
        return maybe_callable

    return maybe_callable(obj, **kwargs)


def groupby_operators(cls):
    """
    Class decorator that attaches one aggregation method per reduction type.

    For every name in ``GROUPBY_REDUCTION_TYPES`` the decorated class is asked,
    through its ``_make_aggop`` classmethod, to build a callable, which is then
    bound on the class under that same name.

    Parameters
    ----------
    cls : type
        A class that provides ``_make_aggop(opname)``.

    Returns
    -------
    type
        The same class object, after the attributes have been added.
    """
    for opname in GROUPBY_REDUCTION_TYPES:
        aggregator = cls._make_aggop(opname)
        setattr(cls, opname, aggregator)
    return cls


@groupby_operators
class DataFrameGroupBy:
    """
    A DataFrame that has been grouped by a subset of columns.

    Parameters
    ----------
    gb : arkouda.groupbyclass.GroupBy
        The GroupBy object built from the key column(s).
    df : arkouda.dataframe.DataFrame
        The dataframe containing the original data.
    gb_key_names : str or list(str), default=None
        The column name(s) associated with the aggregated columns.
    as_index : bool, default=True
        If True, interpret aggregated column as index
        (only implemented for single dimensional aggregates).
        Otherwise, treat aggregated column as a dataframe column.

    Attributes
    ----------
    gb : arkouda.groupbyclass.GroupBy
        GroupBy object, where the aggregation keys are values of column(s) of a dataframe,
        usually in preparation for aggregating with respect to the other columns.
    df : arkouda.dataframe.DataFrame
        The dataframe containing the original data.
    gb_key_names : str or list(str)
        The column name(s) associated with the aggregated columns.
    as_index : bool, default=True
        If True the grouped values of the aggregation keys will be treated as an index.
    """

    def __init__(self, gb, df, gb_key_names=None, as_index=True):
        self.gb = gb
        self.df = df
        self.gb_key_names = gb_key_names
        self.as_index = as_index
        # Mirror a few GroupBy attributes directly on this object.
        for attr in ["nkeys", "permutation", "unique_keys", "segments"]:
            setattr(self, attr, getattr(gb, attr))

    @classmethod
    def _make_aggop(cls, opname):
        """
        Build the aggregation method for reduction ``opname``.

        Used by the ``groupby_operators`` class decorator, which binds the
        returned function on the class under the name ``opname``.
        """
        numerical_dtypes = [akfloat64, akint64, akuint64]

        def aggop(self, colnames=None):
            """
            Aggregate the operation, with the grouped column(s) values as keys.

            Parameters
            ----------
            colnames : (list of) str, default=None
                Column name or list of column names to compute the aggregation over.
                Non-numeric columns and the group-by key columns are skipped.

            Returns
            -------
            arkouda.dataframe.DataFrame
                ``None`` is returned when ``gb_key_names`` is neither a str nor a list.
            """
            if colnames is None:
                colnames = list(self.df.data.keys())
            elif isinstance(colnames, str):
                colnames = [colnames]

            key_names = self.gb_key_names

            # Keep only numeric (or bigint) columns that are not group-by keys.
            colnames = [
                c
                for c in colnames
                if ((self.df.data[c].dtype in numerical_dtypes) or self.df.data[c].dtype == bigint)
                and (
                    (isinstance(key_names, str) and (c != key_names))
                    or (isinstance(key_names, list) and c not in key_names)
                )
            ]

            if isinstance(key_names, str):
                index_name = key_names
            elif isinstance(key_names, list) and len(key_names) == 1:
                index_name = key_names[0]
            elif isinstance(key_names, list):
                # Several (or zero) keys: the keys become leading columns.
                column_dict = dict(zip(key_names, self.unique_keys))
                for c in colnames:
                    column_dict[c] = self.gb.aggregate(self.df.data[c], opname)[1]
                return DataFrame(column_dict)
            else:
                # Unsupported key type: nothing to aggregate against.
                return None

            # Single key: the unique key values become the index.
            return DataFrame(
                {c: self.gb.aggregate(self.df.data[c], opname)[1] for c in colnames},
                index=Index(self.gb.unique_keys, name=index_name),
            )

        return aggop

    def size(self, as_series=None, sort_index=True):
        """
        Compute the size of each value as the total number of rows, including NaN values.

        Parameters
        ----------
        as_series : bool, default=None
            Indicates whether to return arkouda.dataframe.DataFrame (if as_series = False) or
            arkouda.series.Series (if as_series = True). When None, a Series is
            returned if ``as_index`` is True and a DataFrame otherwise.
        sort_index : bool, default=True
            If True, results will be returned with index values sorted in ascending order.

        Returns
        -------
        arkouda.dataframe.DataFrame or arkouda.series.Series

        Examples
        --------

        >>> import arkouda as ak
        >>> ak.connect()
        >>> df = ak.DataFrame({"A":[1,2,2,3],"B":[3,4,5,6]})
        >>> display(df)

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  0 |   1 |   3 |
        +----+-----+-----+
        |  1 |   2 |   4 |
        +----+-----+-----+
        |  2 |   2 |   5 |
        +----+-----+-----+
        |  3 |   3 |   6 |
        +----+-----+-----+

        >>> df.groupby("A").size(as_series = False)

        +----+---------+
        |    |   size  |
        +====+=========+
        |  0 |       1 |
        +----+---------+
        |  1 |       2 |
        +----+---------+
        |  2 |       1 |
        +----+---------+

        """
        if as_series is True or (as_series is None and self.as_index is True):
            return self._return_agg_series(self.gb.size(), sort_index=sort_index)
        else:
            return self._return_agg_dataframe(self.gb.size(), "size", sort_index=sort_index)

    def head(
        self,
        n: int = 5,
        sort_index: bool = True,
    ) -> DataFrame:
        """
        Return the first n rows from each group.

        Parameters
        ----------
        n: int, optional, default = 5
            Maximum number of rows to return for each group.
            If the number of rows in a group is less than n,
            all the values from that group will be returned.
        sort_index: bool, default = True
            If true, return the DataFrame with indices sorted.

        Returns
        -------
        arkouda.dataframe.DataFrame

        Examples
        --------

        >>> import arkouda as ak
        >>> from arkouda import *
        >>> df = ak.DataFrame({"a":ak.arange(10) %3 , "b":ak.arange(10)})

        +----+-----+-----+
        |    |   a |   b |
        +====+=====+=====+
        |  0 |   0 |   0 |
        +----+-----+-----+
        |  1 |   1 |   1 |
        +----+-----+-----+
        |  2 |   2 |   2 |
        +----+-----+-----+
        |  3 |   0 |   3 |
        +----+-----+-----+
        |  4 |   1 |   4 |
        +----+-----+-----+
        |  5 |   2 |   5 |
        +----+-----+-----+
        |  6 |   0 |   6 |
        +----+-----+-----+
        |  7 |   1 |   7 |
        +----+-----+-----+
        |  8 |   2 |   8 |
        +----+-----+-----+
        |  9 |   0 |   9 |
        +----+-----+-----+

        >>> df.groupby("a").head(2)

        +----+-----+-----+
        |    |   a |   b |
        +====+=====+=====+
        |  0 |   0 |   0 |
        +----+-----+-----+
        |  1 |   0 |   3 |
        +----+-----+-----+
        |  2 |   1 |   1 |
        +----+-----+-----+
        |  3 |   1 |   4 |
        +----+-----+-----+
        |  4 |   2 |   2 |
        +----+-----+-----+
        |  5 |   2 |   5 |
        +----+-----+-----+

        """
        _, indx = self.gb.head(self.df.index.values, n=n, return_indices=True)
        if sort_index:
            indx = aksort(indx)
        return self.df[indx]

    def tail(
        self,
        n: int = 5,
        sort_index: bool = True,
    ) -> DataFrame:
        """
        Return the last n rows from each group.

        Parameters
        ----------
        n: int, optional, default = 5
            Maximum number of rows to return for each group.
            If the number of rows in a group is less than n,
            all the rows from that group will be returned.
        sort_index: bool, default = True
            If true, return the DataFrame with indices sorted.

        Returns
        -------
        arkouda.dataframe.DataFrame

        Examples
        --------

        >>> import arkouda as ak
        >>> from arkouda import *
        >>> df = ak.DataFrame({"a":ak.arange(10) %3 , "b":ak.arange(10)})

        +----+-----+-----+
        |    |   a |   b |
        +====+=====+=====+
        |  0 |   0 |   0 |
        +----+-----+-----+
        |  1 |   1 |   1 |
        +----+-----+-----+
        |  2 |   2 |   2 |
        +----+-----+-----+
        |  3 |   0 |   3 |
        +----+-----+-----+
        |  4 |   1 |   4 |
        +----+-----+-----+
        |  5 |   2 |   5 |
        +----+-----+-----+
        |  6 |   0 |   6 |
        +----+-----+-----+
        |  7 |   1 |   7 |
        +----+-----+-----+
        |  8 |   2 |   8 |
        +----+-----+-----+
        |  9 |   0 |   9 |
        +----+-----+-----+

        >>> df.groupby("a").tail(2)

        +----+-----+-----+
        |    |   a |   b |
        +====+=====+=====+
        |  0 |   0 |   6 |
        +----+-----+-----+
        |  1 |   0 |   9 |
        +----+-----+-----+
        |  2 |   1 |   4 |
        +----+-----+-----+
        |  3 |   1 |   7 |
        +----+-----+-----+
        |  4 |   2 |   5 |
        +----+-----+-----+
        |  5 |   2 |   8 |
        +----+-----+-----+

        """
        _, indx = self.gb.tail(self.df.index.values, n=n, return_indices=True)
        if sort_index:
            indx = aksort(indx)
        return self.df[indx]

    def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None):
        """
        Return a random sample from each group. You can either specify the number of elements
        or the fraction of elements to be sampled. random_state can be used for reproducibility

        Parameters
        ----------
        n: int, optional
            Number of items to return for each group.
            Cannot be used with frac and must be no larger than
            the smallest group unless replace is True.
            Default is one if frac is None.

        frac: float, optional
            Fraction of items to return. Cannot be used with n.

        replace: bool, default False
            Allow or disallow sampling of the same row more than once.

        weights: pdarray, optional
            Default None results in equal probability weighting.
            If passed a pdarray, then values must have the same length as the underlying DataFrame
            and will be used as sampling probabilities after normalization within each group.
            Weights must be non-negative with at least one positive element within each group.

        random_state: int or ak.random.Generator, optional
            If int, seed for random number generator.
            If ak.random.Generator, use as given.

        Returns
        -------
        DataFrame
            A new DataFrame containing items randomly sampled from each group
            sorted according to the grouped columns.

        Examples
        --------

        >>> import arkouda as ak
        >>> ak.connect()
        >>> df = ak.DataFrame({"A":[3,1,2,1,2,3],"B":[3,4,5,6,7,8]})
        >>> display(df)

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  0 |   3 |   3 |
        +----+-----+-----+
        |  1 |   1 |   4 |
        +----+-----+-----+
        |  2 |   2 |   5 |
        +----+-----+-----+
        |  3 |   1 |   6 |
        +----+-----+-----+
        |  4 |   2 |   7 |
        +----+-----+-----+
        |  5 |   3 |   8 |
        +----+-----+-----+

        >>> df.groupby("A").sample(random_state=6)

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  3 |   1 |   6 |
        +----+-----+-----+
        |  4 |   2 |   7 |
        +----+-----+-----+
        |  5 |   3 |   8 |
        +----+-----+-----+

        >>> df.groupby("A").sample(frac=0.5, random_state=3, weights=ak.array([1,1,1,0,0,0]))

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  1 |   1 |   4 |
        +----+-----+-----+
        |  2 |   2 |   5 |
        +----+-----+-----+
        |  0 |   3 |   3 |
        +----+-----+-----+

        >>> df.groupby("A").sample(n=3, replace=True, random_state=ak.random.default_rng(7))

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  1 |   1 |   4 |
        +----+-----+-----+
        |  3 |   1 |   6 |
        +----+-----+-----+
        |  1 |   1 |   4 |
        +----+-----+-----+
        |  4 |   2 |   7 |
        +----+-----+-----+
        |  4 |   2 |   7 |
        +----+-----+-----+
        |  4 |   2 |   7 |
        +----+-----+-----+
        |  0 |   3 |   3 |
        +----+-----+-----+
        |  5 |   3 |   8 |
        +----+-----+-----+
        |  5 |   3 |   8 |
        +----+-----+-----+

        """
        return self.df[
            self.gb.sample(
                values=self.df.index.values,
                n=n,
                frac=frac,
                replace=replace,
                weights=weights,
                random_state=random_state,
                return_indices=True,
                permute_samples=True,
            )
        ]

    def _return_agg_series(self, values, sort_index=True):
        """
        Wrap an aggregation result in a Series.

        Parameters
        ----------
        values : object
            The aggregation result. When ``as_index`` is True and this is a
            2-element tuple/list, only its second element is used as the data,
            because the index is supplied separately from the unique keys.
        sort_index : bool, default=True
            If True, the result is passed through ``Series.sort_index()``.

        Returns
        -------
        arkouda.series.Series

        Raises
        ------
        TypeError
            If ``as_index`` is True and ``gb_key_names`` is neither a str nor
            a non-empty list.
        """
        if self.as_index is True:
            key_names = self.gb_key_names
            if isinstance(key_names, str):
                index = Index(self.gb.unique_keys, name=key_names)
            elif isinstance(key_names, list) and len(key_names) == 1:
                index = Index(self.gb.unique_keys, name=key_names[0])
            elif isinstance(key_names, list) and len(key_names) > 1:
                index = MultiIndex(self.gb.unique_keys, names=key_names)
            else:
                # Previously this fell through and failed later with an
                # UnboundLocalError; fail here with a descriptive error instead.
                raise TypeError(
                    "gb_key_names must be a str or a non-empty list of str "
                    f"when as_index is True, got {key_names!r}."
                )

            # handle when values is a tuple/list containing data and index
            # since we are also sending the index keyword
            if isinstance(values, (tuple, list)) and len(values) == 2:
                _, values = values

            series = Series(values, index=index)
        else:
            series = Series(values)

        if sort_index is True:
            series = series.sort_index()

        return series

    def _return_agg_dataframe(self, values, name, sort_index=True):
        """
        Wrap an aggregation result in a DataFrame with value column ``name``.

        With a single group-by key the key values become either the index
        (``as_index`` True) or a regular column. With several keys the result
        is built through ``Series(values).to_dataframe``; ``as_index`` and
        ``sort_index`` are not consulted on that path.
        """
        key_names = self.gb_key_names

        if isinstance(key_names, str):
            key_name = key_names
        elif len(key_names) == 1:
            key_name = key_names[0]
        else:
            return Series(values).to_dataframe(index_labels=key_names, value_label=name)

        if self.as_index is True:
            df = DataFrame(
                {name: values[1]},
                index=Index(self.gb.unique_keys, name=key_name),
            )
        else:
            df = DataFrame({key_name: self.gb.unique_keys, name: values[1]})

        if sort_index is True:
            df = df.sort_index()

        return df

    def diff(self, colname):
        """
        Create a difference aggregate for the given column.

        For each group, the difference between successive values is calculated.
        Aggregate operations (mean,min,max,std,var) can be done on the results.

        Parameters
        ----------
        colname: str
            Name of the column to compute the difference on.

        Returns
        -------
        DiffAggregate
            Object containing the differences, which can be aggregated.

        Examples
        --------

        >>> import arkouda as ak
        >>> ak.connect()
        >>> df = ak.DataFrame({"A":[1,2,2,2,3,3],"B":[3,9,11,27,86,100]})
        >>> display(df)

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  0 |   1 |   3 |
        +----+-----+-----+
        |  1 |   2 |   9 |
        +----+-----+-----+
        |  2 |   2 |  11 |
        +----+-----+-----+
        |  3 |   2 |  27 |
        +----+-----+-----+
        |  4 |   3 |  86 |
        +----+-----+-----+
        |  5 |   3 | 100 |
        +----+-----+-----+

        >>> gb = df.groupby("A")
        >>> gb.diff("B").values
        array([nan nan 2.00000000000000000 16.00000000000000000 nan 14.00000000000000000])

        """
        return DiffAggregate(self.gb, self.df.data[colname])

    def broadcast(self, x, permute=True):
        """
        Fill each group's segment with a constant value.

        Parameters
        ----------
        x :  Series or pdarray
            The values to put in each group's segment.
        permute : bool, default=True
            If True (default), permute broadcast values back to the
            ordering of the original array on which GroupBy was called.
            If False, the broadcast values are grouped by value.

        Returns
        -------
        arkouda.series.Series
            A Series with the Index of the original frame and the values of the broadcast.

        Examples
        --------

        >>> import arkouda as ak
        >>> ak.connect()
        >>> from arkouda.dataframe import DataFrameGroupBy
        >>> df = ak.DataFrame({"A":[1,2,2,3],"B":[3,4,5,6]})

        +----+-----+-----+
        |    |   A |   B |
        +====+=====+=====+
        |  0 |   1 |   3 |
        +----+-----+-----+
        |  1 |   2 |   4 |
        +----+-----+-----+
        |  2 |   2 |   5 |
        +----+-----+-----+
        |  3 |   3 |   6 |
        +----+-----+-----+

        >>> gb = df.groupby("A")
        >>> x = ak.array([10,11,12])
        >>> s = DataFrameGroupBy.broadcast(gb, x)
        >>> df["C"] = s.values
        >>> display(df)

        +----+-----+-----+-----+
        |    |   A |   B |   C |
        +====+=====+=====+=====+
        |  0 |   1 |   3 |  10 |
        +----+-----+-----+-----+
        |  1 |   2 |   4 |  11 |
        +----+-----+-----+-----+
        |  2 |   2 |   5 |  11 |
        +----+-----+-----+-----+
        |  3 |   3 |   6 |  12 |
        +----+-----+-----+-----+

        """
        # A Series is reduced to its underlying values before broadcasting.
        raw = x.values if isinstance(x, Series) else x
        data = self.gb.broadcast(raw, permute=permute)
        return Series(data=data, index=self.df.index)
@groupby_operators
class DiffAggregate:
    """
    A column in a GroupBy that has been differenced.
    Aggregation operations can be done on the result.

    Attributes
    ----------
    gb : arkouda.groupbyclass.GroupBy
        GroupBy object, where the aggregation keys are values of column(s) of a dataframe.
    values : float64 array created with ``zeros``
        Successive differences of the column, taken in grouped (permuted) order.
        The entry at each position listed in ``gb.segments`` is NaN.
    """

    def __init__(self, gb, series):
        self.gb = gb

        # Reorder the column with the GroupBy permutation before differencing.
        grouped = series[gb.permutation]

        diffs = zeros(len(series), "float64")
        diffs[1:] = akcast(grouped[1:] - grouped[:-1], "float64")
        # Overwrite the entries at the positions given by gb.segments with NaN.
        diffs[gb.segments] = np.nan

        self.values = diffs

    @classmethod
    def _make_aggop(cls, opname):
        """Build the method that aggregates ``self.values`` with ``opname``."""

        def aggop(self):
            aggregated = self.gb.aggregate(self.values, opname)
            return Series(aggregated)

        return aggop
""" DataFrame structure based on Arkouda arrays. """
[docs] class DataFrame(UserDict): """ A DataFrame structure based on arkouda arrays. Parameters ---------- initialdata : List or dictionary of lists, tuples, or pdarrays Each list/dictionary entry corresponds to one column of the data and should be a homogenous type. Different columns may have different types. If using a dictionary, keys should be strings. index : Index, pdarray, or Strings Index for the resulting frame. Defaults to an integer range. columns : List, tuple, pdarray, or Strings Column labels to use if the data does not include them. Elements must be strings. Defaults to an stringified integer range. Examples -------- Create an empty DataFrame and add a column of data: >>> import arkouda as ak >>> ak.connect() >>> df = ak.DataFrame() >>> df['a'] = ak.array([1,2,3]) >>> display(df) +----+-----+ | | a | +====+=====+ | 0 | 1 | +----+-----+ | 1 | 2 | +----+-----+ | 2 | 3 | +----+-----+ Create a new DataFrame using a dictionary of data: >>> userName = ak.array(['Alice', 'Bob', 'Alice', 'Carol', 'Bob', 'Alice']) >>> userID = ak.array([111, 222, 111, 333, 222, 111]) >>> item = ak.array([0, 0, 1, 1, 2, 0]) >>> day = ak.array([5, 5, 6, 5, 6, 6]) >>> amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) >>> df = ak.DataFrame({'userName': userName, 'userID': userID, >>> 'item': item, 'day': day, 'amount': amount}) >>> display(df) +----+------------+----------+--------+-------+----------+ | | userName | userID | item | day | amount | +====+============+==========+========+=======+==========+ | 0 | Alice | 111 | 0 | 5 | 0.5 | +----+------------+----------+--------+-------+----------+ | 1 | Bob | 222 | 0 | 5 | 0.6 | +----+------------+----------+--------+-------+----------+ | 2 | Alice | 111 | 1 | 6 | 1.1 | +----+------------+----------+--------+-------+----------+ | 3 | Carol | 333 | 1 | 5 | 1.2 | +----+------------+----------+--------+-------+----------+ | 4 | Bob | 222 | 2 | 6 | 4.3 | +----+------------+----------+--------+-------+----------+ | 5 | Alice | 111 | 0 | 
6 | 0.6 | +----+------------+----------+--------+-------+----------+ Indexing works slightly differently than with pandas: >>> df[0] +------------+----------+ | keys | values | +============+==========+ | userName | Alice | +------------+----------+ |userID | 111 | +------------+----------+ | item | 0 | +------------+----------+ | day | 5 | +------------+----------+ | amount | 0.5 | +------------+----------+ >>> df['userID'] array([111, 222, 111, 333, 222, 111]) >>> df['userName'] array(['Alice', 'Bob', 'Alice', 'Carol', 'Bob', 'Alice']) >>> df[ak.array([1,3,5])] +----+------------+----------+--------+-------+----------+ | | userName | userID | item | day | amount | +====+============+==========+========+=======+==========+ | 0 | Bob | 222 | 0 | 5 | 0.6 | +----+------------+----------+--------+-------+----------+ | 1 | Carol | 333 | 1 | 5 | 1.2 | +----+------------+----------+--------+-------+----------+ | 2 | Alice | 111 | 0 | 6 | 0.6 | +----+------------+----------+--------+-------+----------+ Compute the stride: >>> df[1:5:1] +----+------------+----------+--------+-------+----------+ | | userName | userID | item | day | amount | +====+============+==========+========+=======+==========+ | 0 | Bob | 222 | 0 | 5 | 0.6 | +----+------------+----------+--------+-------+----------+ | 1 | Alice | 111 | 1 | 6 | 1.1 | +----+------------+----------+--------+-------+----------+ | 2 | Carol | 333 | 1 | 5 | 1.2 | +----+------------+----------+--------+-------+----------+ | 3 | Bob | 222 | 2 | 6 | 4.3 | +----+------------+----------+--------+-------+----------+ >>> df[ak.array([1,2,3])] +----+------------+----------+--------+-------+----------+ | | userName | userID | item | day | amount | +====+============+==========+========+=======+==========+ | 0 | Bob | 222 | 0 | 5 | 0.6 | +----+------------+----------+--------+-------+----------+ | 1 | Alice | 111 | 1 | 6 | 1.1 | +----+------------+----------+--------+-------+----------+ | 2 | Carol | 333 | 1 | 5 | 1.2 | 
+----+------------+----------+--------+-------+----------+ >>> df[['userID', 'day']] +----+----------+-------+ | | userID | day | +====+==========+=======+ | 0 | 111 | 5 | +----+----------+-------+ | 1 | 222 | 5 | +----+----------+-------+ | 2 | 111 | 6 | +----+----------+-------+ | 3 | 333 | 5 | +----+----------+-------+ | 4 | 222 | 6 | +----+----------+-------+ | 5 | 111 | 6 | +----+----------+-------+ """ _COLUMN_CLASSES = (pdarray, Strings, Categorical, SegArray) objType = "DataFrame" def __init__(self, initialdata=None, index=None, columns=None): super().__init__() self.registered_name = None if isinstance(initialdata, DataFrame): # Copy constructor self._nrows = initialdata._nrows self._bytes = initialdata._bytes self._empty = initialdata._empty self._columns = initialdata._columns if index is None: self._set_index(initialdata.index) else: self._set_index(index) self.data = initialdata.data self.update_nrows() return elif isinstance(initialdata, pd.DataFrame): # copy pd.DataFrame data into the ak.DataFrame object self._nrows = initialdata.shape[0] self._bytes = 0 self._empty = initialdata.empty self._columns = initialdata.columns.tolist() if index is None: self._set_index(initialdata.index) else: self._set_index(index) self.data = {} for key in initialdata.columns: if hasattr(initialdata[key], "values") and isinstance( initialdata[key].values[0], (list, np.ndarray) ): self.data[key] = SegArray.from_multi_array([array(r) for r in initialdata[key]]) elif hasattr(initialdata[key], "values") and isinstance( initialdata[key].values, pd.Categorical ): self.data[key] = Categorical(initialdata[key].values) else: self.data[key] = array(initialdata[key]) self.data.update() return # Some metadata about this dataframe. 
self._nrows = 0 self._bytes = 0 self._empty = True # Initial attempts to keep an order on the columns self._columns = [] self._set_index(index) # Add data to the DataFrame if there is any if initialdata is not None: # Used to prevent uneven array length in initialization. sizes = set() # Initial data is a dictionary of arkouda arrays if isinstance(initialdata, dict): for key, val in initialdata.items(): if isinstance(val, (list, tuple)): val = array(val) if not isinstance(val, self._COLUMN_CLASSES): raise ValueError(f"Values must be one of {self._COLUMN_CLASSES}.") if key.lower() == "index": # handles the index as an Index object instead of a column self._set_index(val) continue sizes.add(val.size) if len(sizes) > 1: raise ValueError("Input arrays must have equal size.") self._empty = False self[key] = val # Initial data is a list of arkouda arrays elif isinstance(initialdata, list): # Create string IDs for the columns keys = [] if columns is not None: if any(not isinstance(label, str) for label in columns): raise TypeError("Column labels must be strings.") if len(columns) != len(initialdata): raise ValueError("Must have as many labels as columns") keys = columns else: keys = [str(x) for x in range(len(initialdata))] for key, col in zip(keys, initialdata): if isinstance(col, (list, tuple)): col = array(col) if not isinstance(col, self._COLUMN_CLASSES): raise ValueError(f"Values must be one of {self._COLUMN_CLASSES}.") sizes.add(col.size) if len(sizes) > 1: raise ValueError("Input arrays must have equal size.") self._empty = False self[key] = col # Initial data is invalid. else: raise ValueError(f"Initialize with dict or list of {self._COLUMN_CLASSES}.") # Update the dataframe indices and metadata. if len(sizes) > 0: self._nrows = sizes.pop() # If the index param was passed in, use that instead of # creating a new one. 
if self.index is None: self._set_index(arange(self._nrows)) else: self._set_index(index) self.update_nrows() def __getattr__(self, key): if key not in self.columns.values: raise AttributeError(f"Attribute {key} not found") # Should this be cached? return Series(data=self[key], index=self.index.index) def __dir__(self): return dir(DataFrame) + self.columns.values + ["columns"] # delete a column def __delitem__(self, key): # This function is a backdoor to messing up the indices and columns. # I needed to reimplement it to prevent bad behavior UserDict.__delitem__(self, key) self._columns.remove(key) # If removing this column emptied the dataframe if len(self._columns) == 0: self._set_index(None) self._empty = True self.update_nrows() def __getitem__(self, key): # convert series to underlying values # Should check for index alignment if isinstance(key, Series): key = key.values # Select rows using an integer pdarray if isinstance(key, pdarray): if key.dtype == akbool: key = arange(key.size)[key] result = {} for k in self._columns: result[k] = UserDict.__getitem__(self, k)[key] # To stay consistent with numpy, provide the old index values return DataFrame(initialdata=result, index=self.index.index[key]) # Select rows or columns using a list if isinstance(key, (list, tuple)): result = DataFrame() if len(key) <= 0: return result if len({type(x) for x in key}) > 1: raise TypeError("Invalid selector: too many types in list.") if isinstance(key[0], str): for k in key: result[k] = self[k] result._empty = False result._set_index(self.index) # column lens remain the same. Copy the indexing return result else: raise TypeError( "DataFrames only support lists for column indexing. " "All list entries must be of type str." 
) # Select a single row using an integer if isinstance(key, int): result = {} row = array([key]) for k in self._columns: result[k] = (UserDict.__getitem__(self, k)[row])[0] return Row(result) # Select a single column using a string elif isinstance(key, str): if key not in self.keys(): raise KeyError(f"Invalid column name '{key}'.") return UserDict.__getitem__(self, key) # Select rows using a slice elif isinstance(key, slice): # result = DataFrame() rtn_data = {} s = key for k in self._columns: rtn_data[k] = UserDict.__getitem__(self, k)[s] return DataFrame(initialdata=rtn_data, index=self.index.index[arange(self._nrows)[s]]) else: raise IndexError("Invalid selector: unknown error.") def __setitem__(self, key, value): self.update_nrows() # If this is the first column added, we must create an index column. add_index = False if self._empty: add_index = True # Set a single row in the dataframe using a dict of values if isinstance(key, int): for k in self._columns: if isinstance(self.data[k], Strings): raise ValueError( "This DataFrame has a column of type ak.Strings;" " so this DataFrame is immutable. This feature could change" " if arkouda supports mutable Strings in the future." 
) if self._empty: raise ValueError("Initial data must be dict of arkouda arrays.") elif not isinstance(value, (dict, UserDict)): raise ValueError("Expected dict or Row type.") elif key >= self._nrows: raise KeyError("The row index is out of range.") else: for k, v in value.items(): # maintaining to prevent adding index column if k == "index": continue self[k][key] = v # Set a single column in the dataframe using a an arkouda array elif isinstance(key, str): if isinstance(value, Series): value = value.values if not isinstance(value, self._COLUMN_CLASSES): raise ValueError(f"Column must be one of {self._COLUMN_CLASSES}.") elif self._nrows is not None and self._nrows != value.size: raise ValueError(f"Expected size {self._nrows} but received size {value.size}.") else: self._empty = False UserDict.__setitem__(self, key, value) # Update the index values if key not in self._columns: self._columns.append(key) # Do nothing and return if there's no valid data else: raise ValueError("No valid data received.") # Update the dataframe indices and metadata. if add_index: self.update_nrows() self._set_index(arange(self._nrows)) def __len__(self): """ Return the number of rows. """ return self._nrows def _ncols(self): """ Number of columns. If index appears, we now want to utilize this because the actual index has been moved to a property """ return len(self._columns) def __str__(self): """ Returns a summary string of this dataframe. """ self.update_nrows() if self._empty: return "DataFrame([ -- ][ 0 rows : 0 B])" keys = [str(key) for key in list(self._columns)] keys = [("'" + key + "'") for key in keys] keystr = ", ".join(keys) # first call to memory_usage_info() initializes self._bytes mem = self.memory_usage_info() # Get units that make the most sense. 
if self._bytes < 1024: mem = self.memory_usage_info(unit="B") elif self._bytes < 1024**2: mem = self.memory_usage_info(unit="KB") elif self._bytes < 1024**3: mem = self.memory_usage_info(unit="MB") else: mem = self.memory_usage_info(unit="GB") rows = " rows" if self._nrows == 1: rows = " row" return "DataFrame([" + keystr + "], {:,}".format(self._nrows) + rows + ", " + str(mem) + ")" def _get_head_tail(self): if self._empty: return pd.DataFrame() self.update_nrows() maxrows = pd.get_option("display.max_rows") if self._nrows <= maxrows: newdf = DataFrame() for col in self._columns: if isinstance(self[col], Categorical): newdf[col] = self[col].categories[self[col].codes] else: newdf[col] = self[col] newdf._set_index(self.index) return newdf.to_pandas(retain_index=True) # Being 1 above the threshold causes the PANDAS formatter to split the data frame vertically idx = array( list(range(maxrows // 2 + 1)) + list(range(self._nrows - (maxrows // 2), self._nrows)) ) newdf = DataFrame() for col in self._columns: if isinstance(self[col], Categorical): newdf[col] = self[col].categories[self[col].codes[idx]] else: newdf[col] = self[col][idx] newdf._set_index(self.index.index[idx]) return newdf.to_pandas(retain_index=True) def _get_head_tail_server(self): if self._empty: return pd.DataFrame() self.update_nrows() maxrows = pd.get_option("display.max_rows") if self._nrows <= maxrows: newdf = DataFrame() for col in self._columns: if isinstance(self[col], Categorical): newdf[col] = self[col].categories[self[col].codes] else: newdf[col] = self[col] newdf._set_index(self.index) return newdf.to_pandas(retain_index=True) # Being 1 above the threshold causes the PANDAS formatter to split the data frame vertically idx = array( list(range(maxrows // 2 + 1)) + list(range(self._nrows - (maxrows // 2), self._nrows)) ) msg_list = [] for col in self._columns: if isinstance(self[col], Categorical): msg_list.append(f"Categorical+{col}+{self[col].codes.name}+{self[col].categories.name}") elif 
isinstance(self[col], SegArray): msg_list.append(f"SegArray+{col}+{self[col].segments.name}+{self[col].values.name}") elif isinstance(self[col], Strings): msg_list.append(f"Strings+{col}+{self[col].name}") elif isinstance(self[col], Fields): msg_list.append(f"Fields+{col}+{self[col].name}") elif isinstance(self[col], IPv4): msg_list.append(f"IPv4+{col}+{self[col].name}") elif isinstance(self[col], Datetime): msg_list.append(f"Datetime+{col}+{self[col].name}") elif isinstance(self[col], BitVector): msg_list.append(f"BitVector+{col}+{self[col].name}") else: msg_list.append(f"pdarray+{col}+{self[col].name}") repMsg = cast( str, generic_msg( cmd="dataframe_idx", args={ "size": len(msg_list), "idx_name": idx.name, "columns": msg_list, }, ), ) msgList = json.loads(repMsg) df_dict = {} for m in msgList: # Split to [datatype, column, create] msg = m.split("+", 2) t = msg[0] if t == "Strings": # Categorical is returned as a strings by indexing categories[codes[idx]] df_dict[msg[1]] = Strings.from_return_msg(msg[2]) elif t == "SegArray": # split creates for segments and values eles = msg[2].split("+") df_dict[msg[1]] = SegArray(create_pdarray(eles[0]), create_pdarray(eles[1])) elif t == "Fields": df_dict[msg[1]] = Fields( create_pdarray(msg[2]), self[msg[1]].names, MSB_left=self[msg[1]].MSB_left, pad=self[msg[1]].padchar, separator=self[msg[1]].separator, show_int=self[msg[1]].show_int, ) elif t == "IPv4": df_dict[msg[1]] = IPv4(create_pdarray(msg[2])) elif t == "Datetime": df_dict[msg[1]] = Datetime(create_pdarray(msg[2])) elif t == "BitVector": df_dict[msg[1]] = BitVector( create_pdarray(msg[2]), width=self[msg[1]].width, reverse=self[msg[1]].reverse, ) else: df_dict[msg[1]] = create_pdarray(msg[2]) new_df = DataFrame(df_dict) new_df._set_index(self.index.index[idx]) return new_df.to_pandas(retain_index=True)[self._columns]
[docs] def transfer(self, hostname, port): """ Sends a DataFrame to a different Arkouda server. Parameters ---------- hostname : str The hostname where the Arkouda server intended to receive the DataFrame is running. port : int_scalars The port to send the array over. This needs to be an open port (i.e., not one that the Arkouda server is running on). This will open up `numLocales` ports, each of which in succession, so will use ports of the range {port..(port+numLocales)} (e.g., running an Arkouda server of 4 nodes, port 1234 is passed as `port`, Arkouda will use ports 1234, 1235, 1236, and 1237 to send the array data). This port much match the port passed to the call to `ak.receive_array()`. Returns ------- str A message indicating a complete transfer. Raises ------ ValueError Raised if the op is not within the pdarray.BinOps set TypeError Raised if other is not a pdarray or the pdarray.dtype is not a supported dtype """ self.update_nrows() idx = self._index msg_list = [] for col in self._columns: if isinstance(self[col], Categorical): msg_list.append( f"Categorical+{col}+{self[col].codes.name} \ +{self[col].categories.name}+{self[col]._akNAcode.name}" ) elif isinstance(self[col], SegArray): msg_list.append(f"SegArray+{col}+{self[col].segments.name}+{self[col].values.name}") elif isinstance(self[col], Strings): msg_list.append(f"Strings+{col}+{self[col].name}") elif isinstance(self[col], Fields): msg_list.append(f"Fields+{col}+{self[col].name}") elif isinstance(self[col], IPv4): msg_list.append(f"IPv4+{col}+{self[col].name}") elif isinstance(self[col], Datetime): msg_list.append(f"Datetime+{col}+{self[col].name}") elif isinstance(self[col], BitVector): msg_list.append(f"BitVector+{col}+{self[col].name}") else: msg_list.append(f"pdarray+{col}+{self[col].name}") repMsg = cast( str, generic_msg( cmd="sendDataframe", args={ "size": len(msg_list), "idx_name": idx.name, "columns": msg_list, "hostname": hostname, "port": port, }, ), ) return repMsg
def _shape_str(self):
    """Return the dimensions formatted as ``"<rows> rows x <columns> columns"``."""
    return "{} rows x {} columns".format(self._nrows, self._ncols())


def __repr__(self):
    """
    Return ascii-formatted version of the dataframe.

    The text is the pandas repr of the preview returned by
    ``_get_head_tail_server`` followed by the shape in parentheses.
    """
    preview = self._get_head_tail_server()
    # pandas' own dimension footer is suppressed; the shape is appended below.
    with pd.option_context("display.show_dimensions", False):
        text = preview.__repr__()
    return text + " (" + self._shape_str() + ")"


def _repr_html_(self):
    """
    Return html-formatted version of the dataframe.

    The markup is the pandas html repr of the preview returned by
    ``_get_head_tail_server`` followed by the shape in a ``<p>`` element.
    """
    preview = self._get_head_tail_server()
    # pandas' own dimension footer is suppressed; the shape is appended below.
    with pd.option_context("display.show_dimensions", False):
        markup = preview._repr_html_()
    return markup + "<p>" + self._shape_str() + "</p>"


def _ipython_key_completions_(self):
    """Return the column names offered for ``df[<TAB>`` completion."""
    return self._columns
@classmethod
def from_pandas(cls, pd_df):
    """
    Copy the data from a pandas DataFrame into a new arkouda DataFrame.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        A pandas DataFrame to convert.

    Returns
    -------
    arkouda.dataframe.DataFrame
        An instance of the class the method is called on, built by passing
        `pd_df` as ``initialdata``.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import pandas as pd
    >>> pd_df = pd.DataFrame({"A":[1,2],"B":[3,4]})
    >>> type(pd_df)
    pandas.core.frame.DataFrame
    >>> ak_df = DataFrame.from_pandas(pd_df)
    >>> type(ak_df)
    arkouda.dataframe.DataFrame
    >>> display(ak_df)

    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   1 |   3 |
    +----+-----+-----+
    |  1 |   2 |   4 |
    +----+-----+-----+
    """
    # Build through `cls` so that a subclass gets an instance of itself
    # rather than a plain DataFrame.
    return cls(initialdata=pd_df)
def _drop_column(self, keys):
    """
    Drop a column or columns from the dataframe, in-place.

    Parameters
    ----------
    keys : list
        The labels to be dropped on the given axis.
    """
    for key in keys:
        # This will raise an exception if key does not exist
        # Use self.pop(key, None) if we do not want to error
        del self[key]


def _drop_row(self, keys):
    """
    Drop a row or rows from the dataframe, in-place.

    Parameters
    ----------
    keys : list
        The indexes to be dropped on the given axis.

    Raises
    ------
    TypeError
        Raised if any key is not an integer. Nothing is modified in that case.
    """
    # Work on a private copy so the caller's list is never reordered.
    ordered_keys = list(keys)
    # Validate before sorting: sorting mixed types would fail with an
    # unrelated comparison error instead of the message below.
    for k in ordered_keys:
        if not isinstance(k, int):
            raise TypeError("Index keys must be integers.")
    # sort to ensure we go in ascending order.
    ordered_keys.sort()

    # Collect the slices of the index that lie between the dropped keys.
    idx_list = []
    last_idx = -1
    for k in ordered_keys:
        idx_list.append(self.index.index[(last_idx + 1) : k])
        last_idx = k
    idx_list.append(self.index.index[(last_idx + 1) :])

    idx_to_keep = concatenate(idx_list)
    # NOTE(review): the kept index values are used to index the columns, which
    # presumably assumes the index holds row positions -- confirm with callers.
    for key in self.keys():
        # using the UserDict.__setitem__ here because we know all the columns are being
        # reset to the same size
        # This avoids the size checks we would do when only setting a single column
        UserDict.__setitem__(self, key, self[key][idx_to_keep])
    self._set_index(idx_to_keep)
@typechecked
def drop(
    self,
    keys: Union[str, int, List[Union[str, int]]],
    axis: Union[str, int] = 0,
    inplace: bool = False,
) -> Union[None, DataFrame]:
    """
    Drop column/s or row/s from the dataframe.

    Parameters
    ----------
    keys : str, int or list
        The labels to be dropped on the given axis. The list passed in is
        not modified.
    axis : int or str
        The axis on which to drop from. 0/'index' - drop rows, 1/'columns' - drop columns.
    inplace: bool, default=False
        When True, perform the operation on the calling object.
        When False, return a new object.

    Returns
    -------
    arkouda.dataframe.DataFrame or None
        DataFrame when `inplace=False`;
        None when `inplace=True`

    Raises
    ------
    ValueError
        Raised if `axis` is not one of 0, 1, 'index' or 'columns'.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> display(df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      3 |
    +----+--------+--------+
    |  1 |      2 |      4 |
    +----+--------+--------+

    Drop column

    >>> df.drop('col1', axis = 1)

    +----+--------+
    |    |   col2 |
    +====+========+
    |  0 |      3 |
    +----+--------+
    |  1 |      4 |
    +----+--------+

    Drop row

    >>> df.drop(0, axis = 0)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      2 |      4 |
    +----+--------+--------+
    """
    # Always hand the helpers a private list: the row helper orders its keys,
    # and the caller's list must not be reordered as a side effect.
    keys = [keys] if isinstance(keys, (str, int)) else list(keys)

    obj = self if inplace else self.copy()

    if axis == 0 or axis == "index":
        # drop a row
        obj._drop_row(keys)
    elif axis == 1 or axis == "columns":
        # drop column
        obj._drop_column(keys)
    else:
        raise ValueError(f"No axis named {axis} for object type DataFrame")

    # If the dataframe just became empty...
    if len(obj._columns) == 0:
        obj._set_index(None)
        obj._empty = True
    obj.update_nrows()

    if not inplace:
        return obj
    return None
def drop_duplicates(self, subset=None, keep="first"):
    """
    Drop duplicated rows and return the resulting DataFrame.

    If a subset of the columns is provided then only one instance of each
    duplicated row will be returned (keep determines which row).

    Parameters
    ----------
    subset : str or Iterable
        Column name, or iterable of column names, to use to dedupe.
        When omitted or empty, all columns are used.
    keep : {'first', 'last'}, default='first'
        Determines which duplicates (if any) to keep. Any value other than
        'last' keeps the first occurrence.

    Returns
    -------
    arkouda.dataframe.DataFrame
        DataFrame with duplicates removed. An empty DataFrame is returned
        unchanged (the same object).

    Raises
    ------
    KeyError
        Raised if a name in `subset` is not a column of the DataFrame.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 2, 3], 'col2': [4, 5, 5, 6]})
    >>> display(df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      4 |
    +----+--------+--------+
    |  1 |      2 |      5 |
    +----+--------+--------+
    |  2 |      2 |      5 |
    +----+--------+--------+
    |  3 |      3 |      6 |
    +----+--------+--------+

    >>> df.drop_duplicates()

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      4 |
    +----+--------+--------+
    |  1 |      2 |      5 |
    +----+--------+--------+
    |  2 |      3 |      6 |
    +----+--------+--------+
    """
    if self._empty:
        return self

    if not subset:
        subset = self._columns
    elif isinstance(subset, str):
        # A bare string is one column name, not an iterable of characters.
        subset = [subset]
    else:
        subset = list(subset)

    # Validate every requested column up front and name the one that is missing.
    for col in subset:
        if col not in self.data:
            raise KeyError(f"{col} is not a column in the DataFrame.")

    if len(subset) == 1:
        gp = akGroupBy(self.data[subset[0]])
    else:
        gp = akGroupBy([self.data[col] for col in subset])

    if keep == "last":
        # The last member of a group sits just before the next group's start;
        # the final group ends at the end of the permutation.
        _segment_ends = concatenate([gp.segments[1:] - 1, array([gp.permutation.size - 1])])
        return self[gp.permutation[_segment_ends]]
    return self[gp.permutation[gp.segments]]
@property
def size(self):
    """
    The number of elements in the dataframe (rows multiplied by columns).

    Returns
    -------
    int
        The product of the two entries of `shape`, or 0 when the row count
        is undetermined (``_nrows`` is None after `update_nrows`).

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.size
    6
    """
    self.update_nrows()
    if self._nrows is None:
        return 0
    n_rows, n_cols = self.shape
    return n_rows * n_cols


@property
def dtypes(self):
    """
    The dtypes of the dataframe.

    Returns
    -------
    dtypes : arkouda.row.Row
        Maps each column name to a type label: the dtype string for pdarray
        columns, "str" for Strings, "Categorical" and "SegArray" otherwise.

    Raises
    ------
    TypeError
        Raised if a column holds an object of any other type.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': ["a", "b"]})
    >>> df.dtypes

    +----+--------+
    |keys| values |
    +====+========+
    |col1|  int64 |
    +----+--------+
    |col2|    str |
    +----+--------+
    """
    labels = {}
    for key, val in self.items():
        if isinstance(val, pdarray):
            label = str(val.dtype)
        elif isinstance(val, Strings):
            label = "str"
        elif isinstance(val, Categorical):
            label = "Categorical"
        elif isinstance(val, SegArray):
            label = "SegArray"
        else:
            raise TypeError(f"Unsupported type encountered for ak.DataFrame, {type(val)}")
        labels[key] = label
    return Row(labels)


@property
def empty(self):
    """
    Whether the dataframe is empty.

    Returns
    -------
    bool
        The value of the internal ``_empty`` flag.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({})
    >>> df.empty
    True
    """
    return self._empty


@property
def shape(self):
    """
    The shape of the dataframe.

    Returns
    -------
    tuple
        ``(number of rows, number of columns)``. The row count is refreshed
        with `update_nrows` first.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.shape
    (3, 2)
    """
    self.update_nrows()
    column_count = len(self._columns)
    return (self._nrows, column_count)


@property
def columns(self):
    """
    An Index where the values are the column names of the dataframe.

    Returns
    -------
    arkouda.index.Index
        The values of the index are the column names of the dataframe.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df.columns
    Index(array(['col1', 'col2']), dtype='<U0')
    """
    names = self._columns
    # A numpy array of names is converted to a plain list before wrapping.
    if isinstance(names, ndarray):
        names = names.tolist()
    return Index(names, allow_list=True)


@property
def index(self):
    """
    The index of the dataframe.

    Returns
    -------
    arkouda.index.Index or arkouda.index.MultiIndex
        The index of the dataframe.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df.index
    Index(array([0 1]), dtype='int64')
    """
    return self._index


def _set_index(self, value):
    """
    Store `value` as the index of the dataframe.

    An Index (or None) is stored as is; a pdarray, Strings or pandas Index is
    wrapped in an Index; a list is converted to an arkouda array first.

    Raises
    ------
    TypeError
        Raised for any other type of `value`.
    """
    if isinstance(value, Index) or value is None:
        self._index = value
        return
    if isinstance(value, (pdarray, Strings, pd.Index)):
        self._index = Index(value)
        return
    if isinstance(value, list):
        self._index = Index(array(value))
        return
    raise TypeError(
        f"DataFrame Index can only be constructed from type ak.Index, pdarray or list."
        f" {type(value)} provided."
    )
@typechecked
def reset_index(self, size: Optional[int] = None, inplace: bool = False) -> Union[None, DataFrame]:
    """
    Set the index to an integer range.

    Useful if this dataframe is the result of a slice operation from
    another dataframe, or if you have permuted the rows and no longer need
    to keep that ordering on the rows.

    Parameters
    ----------
    size : int, optional
        If size is passed (including 0), do not attempt to determine size
        based on existing column sizes. Assume caller handles consistency
        correctly.
    inplace: bool, default=False
        When True, perform the operation on the calling object.
        When False, return a new object.

    Returns
    -------
    arkouda.dataframe.DataFrame or None
        DataFrame when `inplace=False`;
        None when `inplace=True`.

    NOTE
    ----
    Pandas adds a column 'index' to indicate the original index. Arkouda does not currently
    support this behavior.

    Example
    -------
    >>> df = ak.DataFrame({"A": ak.array([1, 2, 3]), "B": ak.array([4, 5, 6])})
    >>> perm_df = df[ak.array([0,2,1])]
    >>> perm_df.reset_index()

    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   1 |   4 |
    +----+-----+-----+
    |  1 |   3 |   6 |
    +----+-----+-----+
    |  2 |   2 |   5 |
    +----+-----+-----+
    """
    obj = self if inplace else self.copy()

    # Compare against None explicitly: an explicit size of 0 is a real
    # request and must not fall through to the "size not given" branch.
    if size is None:
        obj.update_nrows()
        obj._set_index(arange(obj._nrows))
    else:
        obj._set_index(arange(size))

    if not inplace:
        return obj
    return None
@property
def info(self):
    """
    Returns a summary string of this dataframe.

    Returns
    -------
    str
        A summary string of this dataframe: the quoted column names, the row
        count and the memory usage expressed in B, KB, MB or GB depending on
        the total number of bytes.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': ["a", "b"]})
    >>> df.info
    "DataFrame(['col1', 'col2'], 2 rows, 20 B)"
    """
    self.update_nrows()

    if self._nrows is None:
        return "DataFrame([ -- ][ 0 rows : 0 B])"

    keystr = ", ".join("'" + str(key) + "'" for key in self._columns)

    # Measure the footprint directly so the unit always reflects the current
    # data instead of a cached byte count that may never have been refreshed.
    total_bytes = self.memory_usage(index=True).sum()

    # Get units that make the most sense.
    if total_bytes < 1024:
        unit = "B"
    elif total_bytes < 1024**2:
        unit = "KB"
    elif total_bytes < 1024**3:
        unit = "MB"
    else:
        unit = "GB"
    mem = self.memory_usage_info(unit=unit)

    rows = " row" if self._nrows == 1 else " rows"
    return "DataFrame([" + keystr + "], {:,}".format(self._nrows) + rows + ", " + str(mem) + ")"
def update_nrows(self):
    """
    Recompute the number of rows from the columns and store it in ``_nrows``.

    Columns whose value is None are ignored. ``_nrows`` becomes None when no
    column contributes a size.

    Raises
    ------
    ValueError
        Raised if the columns do not all have the same size; ``_nrows`` is
        left untouched in that case.
    """
    distinct_sizes = {val.size for _key, val in self.items() if val is not None}
    if len(distinct_sizes) > 1:
        raise ValueError("Size mismatch in DataFrame columns.")
    self._nrows = distinct_sizes.pop() if distinct_sizes else None
@typechecked
def _rename_column(
    self, mapper: Union[Callable, Dict], inplace: bool = False
) -> Optional[DataFrame]:
    """
    Rename columns within the dataframe.

    All new names are computed before anything is changed, so mappings that
    swap or chain names (e.g. ``{'A': 'B', 'B': 'A'}``) behave as expected.

    Parameters
    ----------
    mapper : callable or dict-like
        Function or dictionary mapping existing columns to new columns.
        Nonexistent names will not raise an error.
    inplace: bool, default=False
        When True, perform the operation on the calling object.
        When False, return a new object.

    Returns
    -------
    arkouda.dataframe.DataFrame or None
        DataFrame when `inplace=False`
        None when `inplace=True`

    Raises
    ------
    TypeError
        Raised if `mapper` is neither callable nor a dict.
    ValueError
        Raised if the rename would give two columns the same name. Nothing
        is modified in that case.

    See Also
    --------
    ak.DataFrame._rename_index
    ak.DataFrame.rename
    """
    obj = self if inplace else self.copy()

    if callable(mapper):
        translate = mapper
    elif isinstance(mapper, dict):
        # Names missing from the dict keep their current name.
        def translate(name):
            return mapper.get(name, name)

    else:
        raise TypeError("Argument must be callable or dict-like")

    old_names = list(obj._columns)
    new_names = [translate(name) for name in old_names]

    # The columns live in a dict keyed by name, so two columns cannot share a
    # name; refuse before touching anything rather than silently losing data.
    if len(set(new_names)) != len(new_names):
        raise ValueError("Renaming would produce duplicate column names.")

    # Only rename if name has changed
    renames = [
        (pos, old, new)
        for pos, (old, new) in enumerate(zip(old_names, new_names))
        if new != old
    ]
    # Phase 1: detach every renamed column, so that a new name equal to
    # another column's old name cannot overwrite that column's data.
    detached = [(pos, new, obj.data.pop(old)) for pos, old, new in renames]
    # Phase 2: re-attach under the new names.
    for pos, new, column in detached:
        obj._columns[pos] = new
        obj.data[new] = column

    if not inplace:
        return obj
    return None


@typechecked
def _rename_index(self, mapper: Union[Callable, Dict], inplace: bool = False) -> Optional[DataFrame]:
    """
    Rename indexes within the dataframe.

    Replacements are derived from the index values as they were before the
    call, so a value produced by one replacement is never replaced again by
    another (e.g. ``{0: 1, 1: 2}`` maps 0 to 1, not to 2).

    Parameters
    ----------
    mapper : callable or dict-like
        Function or dictionary mapping existing indexes to new indexes.
        Nonexistent names will not raise an error.
    inplace: bool, default=False
        When True, perform the operation on the calling object.
        When False, return a new object.

    Returns
    -------
    arkouda.dataframe.DataFrame or None
        DataFrame when `inplace=False`
        None when `inplace=True`

    Raises
    ------
    TypeError
        Raised if `mapper` is neither callable nor a dict, or if a
        replacement value does not have the same type as the value it
        replaces. Nothing is modified in the latter case.

    See Also
    --------
    ak.DataFrame._rename_column
    ak.DataFrame.rename

    Notes
    -----
    This does not function exactly like pandas.
    The replacement value here must be the same type as the existing value.
    """
    obj = self if inplace else self.copy()

    if callable(mapper):
        # Read every original label and compute its replacement before
        # writing, so already-renamed labels are never fed back to the mapper.
        old_values = [obj.index[i] for i in range(obj.index.size)]
        new_values = []
        for oldval in old_values:
            newval = mapper(oldval)
            if type(oldval) is not type(newval):
                raise TypeError("Replacement value must have the same type as the original value")
            new_values.append(newval)
        for pos, (oldval, newval) in enumerate(zip(old_values, new_values)):
            if newval != oldval:
                obj.index.values[pos] = newval
    elif isinstance(mapper, dict):
        for key, val in mapper.items():
            if type(key) is not type(val):
                raise TypeError("Replacement value must have the same type as the original value")
        values = obj.index.values
        # All masks are taken against the untouched values; only then written.
        masks = [(values == key, val) for key, val in mapper.items()]
        for mask, val in masks:
            values[mask] = val
    else:
        raise TypeError("Argument must be callable or dict-like")

    if not inplace:
        return obj
    return None
@typechecked
def rename(
    self,
    mapper: Optional[Union[Callable, Dict]] = None,
    index: Optional[Union[Callable, Dict]] = None,
    column: Optional[Union[Callable, Dict]] = None,
    axis: Union[str, int] = 0,
    inplace: bool = False,
) -> Optional[DataFrame]:
    """
    Rename indexes or columns according to a mapping.

    Parameters
    ----------
    mapper : callable or dict-like, Optional
        Function or dictionary mapping existing values to new values.
        Nonexistent names will not raise an error.
        Uses the value of axis to determine if renaming column or index
    column : callable or dict-like, Optional
        Function or dictionary mapping existing column names to new column names.
        Nonexistent names will not raise an error.
        When this is set, axis is ignored.
    index : callable or dict-like, Optional
        Function or dictionary mapping existing index names to new index names.
        Nonexistent names will not raise an error.
        When this is set, axis is ignored.
    axis: int or str, default=0
        Indicates which axis to perform the rename.
        0/"index" - Indexes
        1/"column"/"columns" - Columns
    inplace: bool, default=False
        When True, perform the operation on the calling object.
        When False, return a new object.

    Returns
    -------
    arkouda.dataframe.DataFrame or None
        DataFrame when `inplace=False`;
        None when `inplace=True`.

    Raises
    ------
    RuntimeError
        Raised if both `column` and `index` are given, or if nothing to
        rename was specified.
    ValueError
        Raised if `axis` is a string that names no axis.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({"A": ak.array([1, 2, 3]), "B": ak.array([4, 5, 6])})

    Rename columns using a mapping:

    >>> df.rename(column={'A':'a', 'B':'c'})

    +----+-----+-----+
    |    |   a |   c |
    +====+=====+=====+
    |  0 |   1 |   4 |
    +----+-----+-----+
    |  1 |   2 |   5 |
    +----+-----+-----+
    |  2 |   3 |   6 |
    +----+-----+-----+

    Rename indexes using a mapping:

    >>> df.rename(index={0:99, 2:11})

    Rename using an axis style parameter:

    >>> df.rename(str.lower, axis='column')

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   1 |   4 |
    +----+-----+-----+
    |  1 |   2 |   5 |
    +----+-----+-----+
    |  2 |   3 |   6 |
    +----+-----+-----+
    """
    if column is not None and index is not None:
        raise RuntimeError("Only column or index can be renamed, cannot rename both at once")

    # convert the axis to the integer value and validate
    if isinstance(axis, str):
        # "columns" is accepted as well, matching the spelling `drop` uses.
        if axis in ("column", "columns", "1"):
            axis = 1
        elif axis in ("index", "0"):
            axis = 0
        else:
            raise ValueError(f"Unknown axis value {axis}. Expecting 0, 1, 'column' or 'index'.")

    # Explicit `column` / `index` arguments take precedence over `mapper`.
    if column is not None:
        return self._rename_column(column, inplace)
    elif mapper is not None and axis == 1:
        return self._rename_column(mapper, inplace)
    elif index is not None:
        return self._rename_index(index, inplace)
    elif mapper is not None and axis == 0:
        return self._rename_index(mapper, inplace)
    else:
        raise RuntimeError("Rename expects index or columns to be specified.")
def append(self, other, ordered=True):
    """
    Concatenate data from 'other' onto the end of this DataFrame, in place.

    Each column of `other` is concatenated onto the column of the same name
    in this DataFrame and the stored columns are replaced by the results.
    Afterwards the row count is refreshed and the index is reset.

    Parameters
    ----------
    other : DataFrame
        The DataFrame object whose data will be appended to this DataFrame.
    ordered: bool, default=True
        If False, allow rows to be interleaved for better performance (but
        data within a row remains together). By default, append all rows
        to the end, in input order.

    Returns
    -------
    self
        Appending occurs in-place, but result is returned for compatibility.
        Note: when this DataFrame is empty, a copy of `other` is returned
        and the calling object itself is left unchanged.

    Raises
    ------
    KeyError
        Raised if the two DataFrames do not have the same set of columns.
    TypeError
        Raised if a pair of columns cannot be concatenated.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df1 = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df2 = ak.DataFrame({'col1': [3], 'col2': [5]})
    >>> df1.append(df2)
    >>> df1

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      3 |
    +----+--------+--------+
    |  1 |      2 |      4 |
    +----+--------+--------+
    |  2 |      3 |      5 |
    +----+--------+--------+
    """
    from arkouda.util import generic_concat as util_concatenate

    # Do nothing if the other dataframe is empty
    if other.empty:
        return self

    self.update_nrows()
    own_columns = list(self._columns)
    own_column_set = set(self._columns)

    if self.empty:
        # Allow for starting with an empty dataframe
        self = other.copy()
    elif own_column_set != set(other._columns):
        raise KeyError("Key mismatch; keys must be identical in both DataFrames.")
    else:
        # Build every merged column first; the stored data is swapped only
        # once all columns concatenated successfully.
        merged = {}
        for name in own_columns:
            try:
                merged[name] = util_concatenate([self[name], other[name]], ordered=ordered)
            except TypeError as e:
                raise TypeError(
                    f"Incompatible types for column {name}: {type(self[name])} vs {type(other[name])}"
                ) from e
        self.data = merged

    # Clean up
    self.update_nrows()
    self.reset_index(inplace=True)
    self._empty = False
    return self
@classmethod
def concat(cls, items, ordered=True):
    """
    Concatenate several DataFrames into a new one.

    Essentially an append, but different formatting. Empty DataFrames in
    `items` are ignored entirely.

    Parameters
    ----------
    items : iterable of DataFrame
        The DataFrames to concatenate, in order.
    ordered : bool, default=True
        Passed through to the column concatenation routine.

    Returns
    -------
    DataFrame
        A new instance of the class holding the concatenated columns; an
        empty instance when there is nothing to concatenate.

    Raises
    ------
    KeyError
        Raised if the non-empty DataFrames do not all have the same columns.
    TypeError
        Raised if the values of a column cannot be concatenated.
    """
    # Drop empty frames once, so they are skipped both while validating the
    # columns and while gathering the column data.
    frames = [df for df in items if not df.empty]
    if not frames:
        return cls()

    columnlist = frames[0]._columns
    columnset = set(columnlist)
    for df in frames[1:]:
        if set(df._columns) != columnset:
            raise KeyError("Cannot concatenate DataFrames with mismatched columns")

    # Imported only once there is real work to do.
    from arkouda.util import generic_concat as util_concatenate

    # if here, columns match
    ret = cls()
    for col in columnlist:
        try:
            ret[col] = util_concatenate([df[col] for df in frames], ordered=ordered)
        except TypeError as e:
            raise TypeError(f"Incompatible types for column {col}") from e
    return ret
def head(self, n=5):
    """
    Return the first `n` rows.

    This function returns the first `n` rows of the dataframe. It is
    useful for quickly verifying data, for example, after sorting or
    appending rows.

    Parameters
    ----------
    n : int, default = 5
        Number of rows to select.

    Returns
    -------
    arkouda.dataframe.DataFrame
        The first `n` rows of the DataFrame.

    See Also
    --------
    tail

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': ak.arange(10), 'col2': -1 * ak.arange(10)})
    >>> df.head(n=2)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      0 |      0 |
    +----+--------+--------+
    |  1 |      1 |     -1 |
    +----+--------+--------+
    """
    leading_rows = slice(None, n)
    return self[leading_rows]
def tail(self, n=5):
    """
    Return the last `n` rows.

    This function returns the last `n` rows for the dataframe. It is
    useful for quickly testing if your object has the right type of data in
    it.

    Parameters
    ----------
    n : int, default=5
        Number of rows to select.

    Returns
    -------
    arkouda.dataframe.DataFrame
        The last `n` rows of the DataFrame. The DataFrame itself is returned
        when it has at most `n` rows or when its row count is undetermined
        (no columns).

    See Also
    --------
    arkouda.dataframe.head

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': ak.arange(10), 'col2': -1 * ak.arange(10)})
    >>> df.tail(n=2)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      8 |     -8 |
    +----+--------+--------+
    |  1 |      9 |     -9 |
    +----+--------+--------+
    """
    self.update_nrows()
    # `_nrows` is None when there are no columns to measure; comparing None
    # with an int would raise, so such a frame is returned as is.
    if self._nrows is None or self._nrows <= n:
        return self
    return self[self._nrows - n :]
def sample(self, n=5):
    """
    Return a random sample of `n` rows.

    Parameters
    ----------
    n : int, default=5
        Number of rows to return.

    Returns
    -------
    arkouda.dataframe.DataFrame
        The sampled `n` rows of the DataFrame. The DataFrame itself is
        returned when it has at most `n` rows or when its row count is
        undetermined (no columns).

    Example
    -------
    >>> df = ak.DataFrame({"A": ak.arange(5), "B": -1 * ak.arange(5)})

    Random output of size 3:

    >>> df.sample(n=3)

    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   1 |  -1 |
    +----+-----+-----+
    |  2 |   4 |  -4 |
    +----+-----+-----+
    """
    self.update_nrows()
    # `_nrows` is None when there are no columns to measure; comparing None
    # with an int would raise, so such a frame is returned as is.
    if self._nrows is None or self._nrows <= n:
        return self
    # Row positions are drawn client-side, without replacement.
    return self[array(random.sample(range(self._nrows), n))]
def GroupBy(self, keys, use_series=False, as_index=True, dropna=True):
    """
    Group the dataframe by a column or a list of columns.

    Parameters
    ----------
    keys : str or list of str
        An (ordered) list of column names or a single string to group by.
    use_series : bool, default=False
        If True, returns an arkouda.dataframe.DataFrameGroupBy object.
        Otherwise an arkouda.groupbyclass.GroupBy object.
    as_index: bool, default=True
        If True, groupby columns will be set as index
        otherwise, the groupby columns will be treated as DataFrame columns.
        Only used when `use_series` is True.
    dropna : bool, default=True
        If True, and the groupby keys contain NaN values,
        the NaN values together with the corresponding row will be dropped.
        Otherwise, the rows corresponding to NaN values will be kept.

    Returns
    -------
    arkouda.dataframe.DataFrameGroupBy or arkouda.groupbyclass.GroupBy
        If use_series = True, returns an arkouda.dataframe.DataFrameGroupBy object.
        Otherwise returns an arkouda.groupbyclass.GroupBy object.

    Raises
    ------
    TypeError
        Raised if `keys` is neither a string nor a list/tuple.

    See Also
    --------
    arkouda.GroupBy

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1.0, 1.0, 2.0, np.nan], 'col2': [4, 5, 6, 7]})
    >>> df.GroupBy("col1")
    <arkouda.groupbyclass.GroupBy at 0x7f2cf23e10c0>
    >>> df.GroupBy("col1").size()
    (array([1.00000000000000000 2.00000000000000000]), array([2 1]))
    >>> df.GroupBy("col1",use_series=True, as_index = False).size()

    +----+--------+--------+
    |    |   col1 |   size |
    +====+========+========+
    |  0 |      1 |      2 |
    +----+--------+--------+
    |  1 |      2 |      1 |
    +----+--------+--------+
    """
    self.update_nrows()

    if isinstance(keys, str):
        cols = self.data[keys]
    elif isinstance(keys, (list, tuple)):
        # A single-element list groups by the bare column, not a list of one.
        if len(keys) == 1:
            cols = self.data[keys[0]]
        else:
            cols = [self.data[col] for col in keys]
    else:
        raise TypeError("keys must be a column name or a list/tuple of column names")

    gb = akGroupBy(cols, dropna=dropna)
    if not use_series:
        return gb
    return DataFrameGroupBy(gb, self, gb_key_names=keys, as_index=as_index)
def memory_usage(self, index=True, unit="B") -> Series:
    """
    Return the memory usage of each column in bytes.

    The memory usage can optionally include the contribution of
    the index.

    Parameters
    ----------
    index : bool, default True
        Specifies whether to include the memory usage of the DataFrame's
        index in returned Series. If ``index=True``, the memory usage of
        the index is the first item in the output.
    unit : str, default = "B"
        Unit to return. One of {'B', 'KB', 'MB', 'GB'}.

    Returns
    -------
    Series
        A Series whose index is the original column names and whose values
        is the memory usage of each column, expressed in `unit`.

    See Also
    --------
    arkouda.pdarrayclass.nbytes
    arkouda.index.Index.memory_usage
    arkouda.index.MultiIndex.memory_usage
    arkouda.series.Series.memory_usage

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> dtypes = [ak.int64, ak.float64, ak.bool]
    >>> data = dict([(str(t), ak.ones(5000, dtype=ak.int64).astype(t)) for t in dtypes])
    >>> df = ak.DataFrame(data)
    >>> df.memory_usage()

    +---------+-------+
    |         |     0 |
    +=========+=======+
    | Index   | 40000 |
    +---------+-------+
    | int64   | 40000 |
    +---------+-------+
    | float64 | 40000 |
    +---------+-------+
    | bool    |  5000 |
    +---------+-------+

    >>> df.memory_usage(index=False)

    +---------+-------+
    |         |     0 |
    +=========+=======+
    | int64   | 40000 |
    +---------+-------+
    | float64 | 40000 |
    +---------+-------+
    | bool    |  5000 |
    +---------+-------+

    To get the approximate total memory usage:

    >>> df.memory_usage(index=True).sum()
    """
    from arkouda.util import convert_bytes

    if index:
        sizes = [self.index.memory_usage(unit=unit)]
        ret_index = ["Index"]
    else:
        sizes = []
        ret_index = []

    # Labels and sizes are both derived from the same list of names, so a
    # size can never end up under another column's label even if the stored
    # data is ordered differently from the column list.
    column_names = list(self.columns.values)
    sizes += [convert_bytes(self.data[name].nbytes, unit=unit) for name in column_names]
    ret_index += column_names

    result = Series(sizes, index=array(ret_index))
    return result
def memory_usage_info(self, unit="GB"):
    """
    A formatted string representation of the size of this DataFrame.

    Parameters
    ----------
    unit : str, default = "GB"
        Unit to return. One of {'KB', 'MB', 'GB'}.

    Returns
    -------
    str
        The total of `memory_usage(index=True)` converted to `unit`,
        formatted with two decimals and followed by the unit.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': ak.arange(1000), 'col2': ak.arange(1000)})
    >>> df.memory_usage_info()
    '0.00 GB'
    """
    from arkouda.util import convert_bytes

    total_bytes = self.memory_usage(index=True).sum()
    converted = convert_bytes(total_bytes, unit=unit)
    return f"{converted:.2f} {unit}"
def to_pandas(self, datalimit=maxTransferBytes, retain_index=False):
    """
    Send this DataFrame to a pandas DataFrame.

    Parameters
    ----------
    datalimit : int, default=arkouda.client.maxTransferBytes
        Transfer limit. The conversion is refused when the estimated size in
        bytes exceeds ``datalimit * number_of_columns * 1024**2``.
    retain_index : bool, default=False
        Normally, to_pandas() creates a new range index object. If you want
        to keep the index column, set this to True.

    Returns
    -------
    pandas.DataFrame or None
        The result of converting this DataFrame to a pandas DataFrame, or
        None (after a UserWarning) when the estimate exceeds the limit.

    Raises
    ------
    IndexError
        Raised if converting a column fails with a TypeError; the original
        error is attached as the cause.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> ak_df = ak.DataFrame({"A": ak.arange(2), "B": -1 * ak.arange(2)})
    >>> import pandas as pd
    >>> pd_df = ak_df.to_pandas()
    >>> type(pd_df)
    pandas.core.frame.DataFrame
    >>> display(pd_df)

    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   1 |  -1 |
    +----+-----+-----+
    """
    self.update_nrows()

    # Estimate how much memory would be required for this DataFrame.
    # Only pdarray, Strings and Categorical columns contribute to the estimate.
    nbytes = 0
    for _key, val in self.items():
        if isinstance(val, pdarray):
            nbytes += (val.dtype).itemsize * self._nrows
        elif isinstance(val, Strings):
            nbytes += val.nbytes
        elif isinstance(val, Categorical):
            nbytes += val.codes.nbytes
            nbytes += val.categories.nbytes

    KB = 1024
    MB = KB * KB
    GB = MB * KB

    # Transfers below one megabyte are not announced.
    if MB <= nbytes < GB:
        msg = "{:,} MB".format(int(nbytes / MB))
        print(f"This transfer will use {msg} .")
    elif nbytes >= GB:
        msg = "{:,} GB".format(int(nbytes / GB))
        print(f"This will transfer {msg} from arkouda to pandas.")

    # If the total memory transfer requires more than `datalimit` per
    # column, we will warn the user and return.
    if nbytes > (datalimit * len(self._columns) * MB):
        msg = f"This operation would transfer more than {datalimit} bytes."
        warn(msg, UserWarning)
        return None

    # Proceed with conversion if possible
    pandas_data = {}
    for key in self._columns:
        val = self[key]
        try:
            # in order for proper pandas functionality, SegArrays must be seen as 1d
            # and therefore need to be converted to list
            if isinstance(val, SegArray):
                pandas_data[key] = val.to_list()
            elif isinstance(val, Categorical):
                pandas_data[key] = val.to_pandas()
            else:
                pandas_data[key] = val.to_ndarray()
        except TypeError as err:
            # Keep the original error attached so the real cause stays visible.
            raise IndexError("Bad index type or format.") from err

    # Return a new dataframe with original indices if requested.
    if retain_index and self.index is not None:
        index = self.index.to_pandas()
        return pd.DataFrame(data=pandas_data, index=index)
    return pd.DataFrame(data=pandas_data)
def to_markdown(self, mode="wt", index=True, tablefmt="grid", storage_options=None, **kwargs):
    r"""
    Print DataFrame in Markdown-friendly format.

    The DataFrame is converted with `to_pandas` and all arguments are
    forwarded to ``pandas.DataFrame.to_markdown``.

    Parameters
    ----------
    mode : str, optional
        Mode in which file is opened, "wt" by default.
    index : bool, optional, default True
        Add index (row) labels.
    tablefmt: str = "grid"
        Table format to call from tabulate:
        https://pypi.org/project/tabulate/
    storage_options: dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc., if using a URL that will be parsed by fsspec,
        e.g., starting "s3://", "gcs://".
        An error will be raised if providing this argument with a non-fsspec URL.
        See the fsspec and backend storage implementation docs for the set
        of allowed keys and values.
    **kwargs
        These parameters will be passed to tabulate.

    Note
    ----
    This function should only be called on small DataFrames as it calls pandas.DataFrame.to_markdown:
    https://pandas.pydata.org/pandas-docs/version/1.2.4/reference/api/pandas.DataFrame.to_markdown.html

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]})
    >>> print(df.to_markdown())
    +----+------------+------------+
    |    | animal_1   | animal_2   |
    +====+============+============+
    |  0 | elk        | dog        |
    +----+------------+------------+
    |  1 | pig        | quetzal    |
    +----+------------+------------+

    Suppress the index:

    >>> print(df.to_markdown(index = False))
    +------------+------------+
    | animal_1   | animal_2   |
    +============+============+
    | elk        | dog        |
    +------------+------------+
    | pig        | quetzal    |
    +------------+------------+
    """
    pandas_frame = self.to_pandas()
    return pandas_frame.to_markdown(
        mode=mode,
        index=index,
        tablefmt=tablefmt,
        storage_options=storage_options,
        **kwargs,
    )
def _prep_data(self, index=False, columns=None): # if no columns are stored, we will save all columns if columns is None: data = self.data else: data = {c: self.data[c] for c in columns} if index: data["Index"] = self.index.values return data
def to_hdf(self, path, index=False, columns=None, file_type="distribute"):
    """
    Write the DataFrame to disk as HDF5, keeping the column names.

    Parameters
    ----------
    path : str
        File path (prefix) to save the data under.
    index : bool, default=False
        If True, also save the index column.
    columns : List, default=None
        Columns to include in the file. If None, all columns are written.
    file_type : str (single | distribute), default=distribute
        Whether to save to a single file or distribute across locales.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray.

    Notes
    -----
    This method saves one file per locale of the arkouda server. All files
    are prefixed by the path argument and suffixed by their locale number.

    See Also
    --------
    to_parquet
    load

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'hdf_output')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A":[1,2],"B":[3,4]})
    >>> df.to_hdf(my_path + "/my_data")
    >>> df.load(my_path + "/my_data")
    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   1 |   3 |
    +----+-----+-----+
    |  1 |   2 |   4 |
    +----+-----+-----+
    """
    from arkouda.io import to_hdf as write_hdf

    payload = self._prep_data(index=index, columns=columns)
    write_hdf(payload, prefix_path=path, file_type=file_type)
def _to_hdf_snapshot(self, path, dataset="DataFrame", mode="truncate", file_type="distribute"):
    """
    Save the dataframe as one HDF5 group with its columns inside the group.

    Storing the frame as a group allows other datasets to live in the same
    HDF5 file without affecting the integrity of the dataframe. This is only
    used by the snapshot workflow.

    Parameters
    ----------
    path : str
        File path to save data.
    dataset : str
        Name to save the dataframe under within the file.
    mode : str (truncate | append), default=truncate
        Whether the file is truncated before writing or appended to.
    file_type : str (single | distribute), default=distribute
        Whether to save to a single file or distribute across locales.

    Returns
    -------
    str
        The reply of the ``tohdf`` server command as returned by
        ``generic_msg``.

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray.
    """
    from arkouda.categorical import Categorical as Categorical_
    from arkouda.io import _file_type_to_int, _mode_str_to_int

    def describe(obj):
        # Categoricals and SegArrays are built from several server-side
        # arrays, so the names of their components are packed into a JSON
        # string; any other column is identified by its own name.
        if isinstance(obj, Categorical_):
            components = {
                "codes": obj.codes.name,
                "categories": obj.categories.name,
                "NA_codes": obj._akNAcode.name,
            }
            if obj.permutation is not None:
                components["permutation"] = obj.permutation.name
            if obj.segments is not None:
                components["segments"] = obj.segments.name
            return json.dumps(components)
        if isinstance(obj, SegArray):
            return json.dumps({"segments": obj.segments.name, "values": obj.values.name})
        return obj.name

    column_data = [describe(obj) for _, obj in self.items()]

    dtypes = []
    for obj in self.values():
        # A Categorical reports the dtype of its categories.
        dtype_source = obj.categories if isinstance(obj, Categorical_) else obj
        dtypes.append(str(dtype_source.dtype))

    col_objTypes = []
    for obj in self.values():
        if hasattr(obj, "special_objType"):
            col_objTypes.append(obj.special_objType)
        else:
            col_objTypes.append(obj.objType)

    message_args = {
        "filename": path,
        "dset": dataset,
        "file_format": _file_type_to_int(file_type),
        "write_mode": _mode_str_to_int(mode),
        "objType": self.objType,
        "num_cols": len(self.columns.values),
        "column_names": self.columns.values,
        "column_objTypes": col_objTypes,
        "column_dtypes": dtypes,
        "columns": column_data,
        "index": self.index.values.name,
    }
    reply = generic_msg(cmd="tohdf", args=message_args)
    return cast(str, reply)
def update_hdf(self, prefix_path: str, index=False, columns=None, repack: bool = True):
    """
    Overwrite the stored dataset of the same name with this dataframe.

    If the dataset does not exist yet, it is added.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share.
    index : bool, default=False
        If True, also save the index column.
    columns : List, default=None
        Columns to include in the file. If None, all columns are written.
    repack : bool, default=True
        HDF5 does not release memory on delete. When True, the inaccessible
        (overwritten) data is removed. When False, the data remains but is
        inaccessible. False gives better performance at the cost of growing
        file sizes.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray.

    Notes
    -----
    If the file does not contain a File_Format attribute indicating how it
    was saved, the file name is checked for _LOCALE#### to determine whether
    it is distributed.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'hdf_output')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A":[1,2],"B":[3,4]})
    >>> df.to_hdf(my_path + "/my_data")
    >>> df2 = ak.DataFrame({"A":[5,6],"B":[7,8]})
    >>> df2.update_hdf(my_path + "/my_data")
    >>> df.load(my_path + "/my_data")
    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   5 |   7 |
    +----+-----+-----+
    |  1 |   6 |   8 |
    +----+-----+-----+
    """
    from arkouda.io import update_hdf as overwrite_hdf

    payload = self._prep_data(index=index, columns=columns)
    overwrite_hdf(payload, prefix_path=prefix_path, repack=repack)
def to_parquet(
    self,
    path,
    index=False,
    columns=None,
    compression: Optional[str] = None,
    convert_categoricals: bool = False,
):
    """
    Write the DataFrame to disk as Parquet, keeping the column names.

    Parameters
    ----------
    path : str
        File path (prefix) to save the data under.
    index : bool, default=False
        If True, also save the index column.
    columns : list
        Columns to include in the file. If None, all columns are written.
    compression : str (Optional), default=None
        Compression type to use when writing the file.
        Supported values: snappy, gzip, brotli, zstd, lz4
    convert_categoricals : bool, default=False
        Parquet requires all columns to be the same size and Categoricals
        don't satisfy that requirement. If set, the equivalent Strings are
        written in place of any Categorical columns.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised if a Categorical column is selected while
        ``convert_categoricals`` is False.
    RuntimeError
        Raised if a server-side error is thrown saving the pdarray.

    Notes
    -----
    This method saves one file per locale of the arkouda server. All files
    are prefixed by the path argument and suffixed by their locale number.

    See Also
    --------
    to_hdf
    load

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'parquet_output')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A":[1,2],"B":[3,4]})
    >>> df.to_parquet(my_path + "/my_data")
    >>> df2 = df.load(my_path + "/my_data")
    """
    from arkouda.io import to_parquet as write_parquet

    payload = self._prep_data(index=index, columns=columns)
    if not convert_categoricals:
        for column in payload.values():
            if isinstance(column, Categorical):
                raise ValueError(
                    "to_parquet doesn't support Categorical columns. To write the equivalent "
                    "Strings in place of any Categorical columns, rerun with convert_categoricals "
                    "set to True."
                )
    write_parquet(
        payload,
        prefix_path=path,
        compression=compression,
        convert_categoricals=convert_categoricals,
    )
@typechecked
def to_csv(
    self,
    path: str,
    index: bool = False,
    columns: Optional[List[str]] = None,
    col_delim: str = ",",
    overwrite: bool = False,
):
    r"""
    Write the DataFrame to CSV file(s).

    The file contains one column for each written DataFrame column. All CSV
    files written by Arkouda include a header denoting the data types of the
    columns. Unlike other file formats, CSV files store Strings in their
    UTF-8 form instead of storing bytes as uint(8).

    Parameters
    ----------
    path : str
        The filename prefix to be used for saving files. Files will have
        _LOCALE#### appended when they are written to disk.
    index : bool, default=False
        If True, the index of the DataFrame is written to the file as a
        column named "Index".
    columns : list of str (Optional)
        Names of the columns to write. If None, all columns are written.
    col_delim : str, default=","
        Value used to separate columns within the file. Please be sure that
        the value used DOES NOT appear in your dataset.
    overwrite : bool, default=False
        If True, any existing files matching the provided prefix are
        overwritten. If False, an error is returned if existing files are
        found.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised if the underlying writer rejects the supplied data or path.
    RuntimeError
        Raised if one or more of the specified files cannot be opened.
    TypeError
        Raised if an argument has the wrong type, or if an unknown
        arkouda_type is returned from the server.

    Notes
    -----
    - CSV format is not currently supported by load/load_all operations.
    - The column delimiter is expected to be the same for column names and data.
    - Be sure that column delimiters are not found within your data.
    - All CSV files must delimit rows using newline ("\\n") at this time.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'csv_output')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A":[1,2],"B":[3,4]})
    >>> df.to_csv(my_path + "/my_data")
    >>> df2 = DataFrame.read_csv(my_path + "/my_data" + "_LOCALE0000")
    >>> display(df2)
    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   1 |   3 |
    +----+-----+-----+
    |  1 |   2 |   4 |
    +----+-----+-----+
    """
    from arkouda.io import to_csv as write_csv

    data = self._prep_data(index=index, columns=columns)
    # The header names must line up one-to-one with what is written. With
    # index=True the prepared data also holds "Index", so the names are taken
    # from the prepared data rather than from the caller's column list.
    names = None if columns is None else list(data)
    write_csv(data, path, names=names, col_delim=col_delim, overwrite=overwrite)
@classmethod
def read_csv(cls, filename: str, col_delim: str = ","):
    r"""
    Build an Arkouda DataFrame from the columns of a CSV file.

    If the file contains the appropriately formatted header, typed data is
    returned. Otherwise, all data is returned as Strings objects.

    Parameters
    ----------
    filename : str
        Filename to read data from.
    col_delim : str, default=","
        The delimiter for columns within the data.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Arkouda DataFrame containing the columns from the CSV file.

    Raises
    ------
    ValueError
        Raised if one or more of the specified files do not exist.
    RuntimeError
        Raised if one or more of the specified files cannot be opened.
    TypeError
        Raised if an unknown arkouda_type is returned from the server.

    See Also
    --------
    to_csv

    Notes
    -----
    - CSV format is not currently supported by load/load_all operations.
    - The column delimiter is expected to be the same for column names and data.
    - Be sure that column delimiters are not found within your data.
    - All CSV files must delimit rows using newline ("\\n") at this time.
    - Unlike other file formats, CSV files store Strings in their UTF-8 form
      instead of storing bytes as uint(8).

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'csv_output','my_data')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A":[1,2],"B":[3,4]})
    >>> df.to_csv(my_path)
    >>> df2 = DataFrame.read_csv(my_path + "_LOCALE0000")
    >>> display(df2)
    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   1 |   3 |
    +----+-----+-----+
    |  1 |   2 |   4 |
    +----+-----+-----+
    """
    from arkouda.io import read_csv as load_csv

    return cls(load_csv(filename, column_delim=col_delim))
def save(
    self,
    path,
    index=False,
    columns=None,
    file_format="HDF5",
    file_type="distribute",
    compression: Optional[str] = None,
):
    """
    DEPRECATED: use ``to_hdf`` or ``to_parquet`` instead.

    Save the DataFrame to disk, keeping the column names. A
    DeprecationWarning is always emitted; the call is then forwarded to
    ``to_hdf`` or ``to_parquet`` depending on ``file_format``.

    Parameters
    ----------
    path : str
        File path (prefix) to save the data under.
    index : bool, default=False
        If True, also save the index column.
    columns : list, default=None
        Columns to include in the file. If None, all columns are written.
    file_format : str, default='HDF5'
        'HDF5' or 'Parquet' (matched case-insensitively).
    file_type : str, default=distribute
        "single" or "distribute". If single, a single file is written to
        locale 0. Only forwarded for HDF5.
    compression : str (Optional)
        (None | "snappy" | "gzip" | "brotli" | "zstd" | "lz4")
        Compression type. Only forwarded for Parquet.

    Raises
    ------
    ValueError
        Raised if ``file_format`` is neither HDF5 nor Parquet.

    Notes
    -----
    This method saves one file per locale of the arkouda server. All files
    are prefixed by the path argument and suffixed by their locale number.

    See Also
    --------
    to_parquet, to_hdf

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'hdf5_output')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A": ak.arange(5), "B": -1 * ak.arange(5)})
    >>> df.save(my_path + '/my_data', file_type="single")
    >>> df2 = df.load(my_path + '/my_data')
    """
    warn(
        "ak.DataFrame.save has been deprecated. "
        "Please use ak.DataFrame.to_hdf or ak.DataFrame.to_parquet",
        DeprecationWarning,
    )
    normalized_format = file_format.lower()
    if normalized_format == "hdf5":
        return self.to_hdf(path, index=index, columns=columns, file_type=file_type)
    if normalized_format == "parquet":
        return self.to_parquet(path, index=index, columns=columns, compression=compression)
    raise ValueError("Valid file types are HDF5 or Parquet")
@classmethod
def load(cls, prefix_path, file_format="INFER"):
    """
    Load a dataframe from file.

    ``file_format`` is accepted for consistency with other load functions.

    Parameters
    ----------
    prefix_path : str
        The prefix path for the data.
    file_format : string, default = "INFER"
        File format of the data. With "INFER" (any letter case) the format
        is detected from the first locale file,
        ``<prefix>_LOCALE0000<extension>``.

    Returns
    -------
    arkouda.dataframe.DataFrame
        A dataframe loaded from the prefix_path.

    Examples
    --------
    To store data in <my_dir>/my_data_LOCALE0000,
    use "<my_dir>/my_data" as the prefix.

    >>> import arkouda as ak
    >>> ak.connect()
    >>> import os.path
    >>> from pathlib import Path
    >>> my_path = os.path.join(os.getcwd(), 'hdf5_output','my_data')
    >>> Path(my_path).mkdir(parents=True, exist_ok=True)
    >>> df = ak.DataFrame({"A": ak.arange(5), "B": -1 * ak.arange(5)})
    >>> df.save(my_path, file_type="distribute")
    >>> df.load(my_path)
    +----+-----+-----+
    |    |   A |   B |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   1 |  -1 |
    +----+-----+-----+
    |  2 |   2 |  -2 |
    +----+-----+-----+
    |  3 |   3 |  -3 |
    +----+-----+-----+
    |  4 |   4 |  -4 |
    +----+-----+-----+
    """
    from arkouda.io import (
        _dict_recombine_segarrays_categoricals,
        get_filetype,
        load_all,
    )

    prefix, extension = os.path.splitext(prefix_path)
    first_file = f"{prefix}_LOCALE0000{extension}"
    filetype = get_filetype(first_file) if file_format.lower() == "infer" else file_format

    df = cls(_dict_recombine_segarrays_categoricals(load_all(prefix_path, file_format=filetype)))

    # HDF5 data comes back in the saved column order. The comparison ignores
    # letter case so that an explicit file_format="hdf5" is recognised just
    # like "HDF5".
    if filetype.lower() == "hdf5":
        return df
    # Non-HDF5 (Parquet) columns load backwards; reverse them to match what
    # was saved.
    return df[df.columns.values[::-1]]
def argsort(self, key, ascending=True):
    """
    Compute the permutation that sorts the dataframe by `key`.

    Parameters
    ----------
    key : str
        The key to sort on.
    ascending : bool, default = True
        If true, sort the key in ascending order.
        Otherwise, sort the key in descending order.

    Returns
    -------
    arkouda.pdarrayclass.pdarray
        The permutation array that sorts the data on `key`. An empty int64
        array is returned for an empty dataframe.

    See Also
    --------
    coargsort

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1.1, 3.1, 2.1], 'col2': [6, 5, 4]})
    >>> df.argsort('col1')
    array([0 2 1])
    >>> sorted_df1 = df[df.argsort('col1')]
    >>> df.argsort('col2')
    array([2 1 0])
    >>> sorted_df2 = df[df.argsort('col2')]
    """
    if self._empty:
        return array([], dtype=akint64)

    column = self[key]
    if ascending:
        return argsort(column)

    # int64/float64 pdarrays are sorted descending by sorting their negation.
    if isinstance(column, pdarray) and column.dtype in (akint64, akfloat64):
        return argsort(-column)

    # Anything else: take the ascending permutation and reverse it.
    reverse_order = arange(self._nrows - 1, -1, -1)
    return argsort(column)[reverse_order]
def coargsort(self, keys, ascending=True):
    """
    Compute the permutation that sorts the dataframe by `keys`.

    Note: Sorting using Strings may not yield correct sort order.

    Parameters
    ----------
    keys : list of str
        The keys to sort on.
    ascending : bool, default = True
        If False, the ascending permutation is reversed.

    Returns
    -------
    arkouda.pdarrayclass.pdarray
        The permutation array that sorts the data on `keys`. An empty int64
        array is returned for an empty dataframe.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [2, 2, 1], 'col2': [3, 4, 3], 'col3':[5, 6, 7]})
    >>> df.coargsort(['col1', 'col2'])
    array([2 0 1])
    """
    if self._empty:
        return array([], dtype=akint64)

    perm = coargsort([self[key] for key in keys])
    if ascending:
        return perm
    return perm[arange(self._nrows - 1, -1, -1)]
def _reindex(self, idx):
    """
    Build a new DataFrame whose rows and index are both selected by `idx`.

    Parameters
    ----------
    idx
        Indexer applied to both ``self`` and ``self.index``.

    Returns
    -------
    DataFrame
        New frame built from ``self[idx]`` with the re-indexed index. The
        index name (and, for a MultiIndex, the level names) is carried over
        when the current index is an Index or MultiIndex.
    """
    current = self.index
    selected = current[idx]
    # MultiIndex is tested first, as in the original ordering of the checks.
    if isinstance(current, MultiIndex):
        new_index = MultiIndex(selected.values, name=current.name, names=current.names)
    elif isinstance(current, Index):
        new_index = Index(selected, name=current.name)
    else:
        new_index = Index(selected)
    return DataFrame(self[idx], index=new_index)
def sort_index(self, ascending=True):
    """
    Sort the DataFrame by its index.

    Note: Fails on sort order of arkouda.strings.Strings columns when
    multiple columns being sorted.

    Parameters
    ----------
    ascending : bool, default = True
        Sort values in ascending (default) or descending order.

    Returns
    -------
    The result of ``_reindex`` applied to the permutation that sorts the
    index.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1.1, 3.1, 2.1], 'col2': [6, 5, 4]},
    ...          index = Index(ak.array([2,0,1]), name="idx"))
    >>> display(df)
    +-----+--------+--------+
    | idx |   col1 |   col2 |
    +=====+========+========+
    |   2 |    1.1 |      6 |
    +-----+--------+--------+
    |   0 |    3.1 |      5 |
    +-----+--------+--------+
    |   1 |    2.1 |      4 |
    +-----+--------+--------+

    >>> df.sort_index()
    +-----+--------+--------+
    | idx |   col1 |   col2 |
    +=====+========+========+
    |   0 |    3.1 |      5 |
    +-----+--------+--------+
    |   1 |    2.1 |      4 |
    +-----+--------+--------+
    |   2 |    1.1 |      6 |
    +-----+--------+--------+
    """
    return self._reindex(self.index.argsort(ascending=ascending))
def sort_values(self, by=None, ascending=True):
    """
    Sort the DataFrame by one or more columns.

    If no column is specified, all columns are used.

    Note: Fails on order of arkouda.strings.Strings columns when multiple
    columns being sorted.

    Parameters
    ----------
    by : str or list/tuple of str, default = None
        The name(s) of the column(s) to sort by.
    ascending : bool, default = True
        Sort values in ascending (default) or descending order.

    Returns
    -------
    ``self`` indexed by the sorting permutation.

    Raises
    ------
    TypeError
        Raised if `by` is not None, a str, a list or a tuple.

    See Also
    --------
    apply_permutation

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [2, 2, 1], 'col2': [3, 4, 3], 'col3':[5, 6, 7]})
    >>> by_all_columns = df.sort_values()
    >>> by_one_column = df.sort_values("col3")
    >>> by_two_columns = df.sort_values(["col1", "col2"], ascending=False)
    """
    if self._empty:
        # NOTE(review): an empty frame yields an empty int64 array rather
        # than a DataFrame; kept as-is, confirm whether callers rely on it.
        return array([], dtype=akint64)

    # Reject unsupported `by` types up front; no sorting has happened yet.
    if by is not None and not isinstance(by, (str, list, tuple)):
        raise TypeError("Column name(s) must be str or list/tuple of str")

    if by is None:
        all_columns = self._columns
        if len(all_columns) == 1:
            perm = self.argsort(all_columns[0], ascending=ascending)
        else:
            perm = self.coargsort(all_columns, ascending=ascending)
    elif isinstance(by, str):
        perm = self.argsort(by, ascending=ascending)
    else:
        perm = self.coargsort(by, ascending=ascending)
    return self[perm]
def apply_permutation(self, perm):
    """
    Apply a permutation to every column and to the index, in place.

    The original DataFrame is modified. This may be useful to unsort a
    DataFrame, or to apply an arbitrary permutation such as the inverse of a
    sorting permutation.

    Parameters
    ----------
    perm : pdarray
        A permutation array. Should be the same size as the data arrays and
        consist of the integers [0, size-1] in some order. Only minimal
        checks are done: the minimum must be 0, the maximum must be
        ``size - 1`` and all values must be unique.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Raised if `perm` fails one of the checks above.

    See Also
    --------
    sort_values

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> perm_arry = ak.array([0, 2, 1])
    >>> df.apply_permutation(perm_arry)
    >>> display(df)
    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      4 |
    +----+--------+--------+
    |  1 |      3 |      6 |
    +----+--------+--------+
    |  2 |      2 |      5 |
    +----+--------+--------+
    """
    spans_full_range = perm.min() == 0 and perm.max() == perm.size - 1
    if not spans_full_range:
        raise ValueError("The indicated permutation is invalid.")
    has_duplicates = unique(perm).size != perm.size
    if has_duplicates:
        raise ValueError("The indicated permutation is invalid.")

    for key in self.data:
        self[key] = self[key][perm]
    self._set_index(self.index[perm])
def filter_by_range(self, keys, low=1, high=None):
    """
    Flag the rows whose key occurs a number of times within [low, high].

    The dataframe is grouped by `keys`; a row qualifies when the size of its
    group lies in the inclusive range [low, high]. To filter by a specific
    count, set low == high.

    Parameters
    ----------
    keys : str or list of str
        The names of the columns to group by.
    low : int, default=1
        The lowest value count.
    high : int, default=None
        The highest value count. None means unlimited; any integer,
        including 0, is applied as an upper bound.

    Returns
    -------
    arkouda.pdarrayclass.pdarray
        An array of boolean values for qualified rows in this DataFrame.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 2, 2, 3, 3], 'col2': [4, 5, 6, 7, 8, 9]})
    >>> df.filter_by_range("col1", low=1, high=2)
    array([True False False False True True])

    >>> filtered_df = df[df.filter_by_range("col1", low=1, high=2)]
    >>> display(filtered_df)
    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      4 |
    +----+--------+--------+
    |  1 |      3 |      8 |
    +----+--------+--------+
    |  2 |      3 |      9 |
    +----+--------+--------+
    """
    if isinstance(keys, str):
        keys = [keys]
    gb = self.GroupBy(keys, use_series=False)
    _, cts = gb.size()

    # Only None disables the upper bound; a plain truthiness test would also
    # silently drop an explicit bound of 0.
    if high is None:
        positions = where(cts >= low, 1, 0)
    else:
        positions = where(((cts >= low) & (cts <= high)), 1, 0)

    # Spread the per-group flag back over the rows (still in grouped order),
    # then undo the grouping permutation to restore the original row order.
    broadcast = gb.broadcast(positions, permute=False)
    broadcast = broadcast == 1
    return broadcast[invert_permutation(gb.permutation)]
def copy(self, deep=True):
    """
    Return a copy of this DataFrame.

    With ``deep=True`` (the default) a new DataFrame is built from slices
    of every column and of the index, so changes to the copy are not
    reflected in the original. With any other value a new DataFrame is
    constructed directly from this one without copying the data; changes
    to the data of either object are reflected in the other.

    Parameters
    ----------
    deep : bool, default=True
        When True, return a deep copy. Otherwise, return a shallow copy.

    Returns
    -------
    arkouda.dataframe.DataFrame
        A deep or shallow copy according to caller specification.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> display(df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      3 |
    +----+--------+--------+
    |  1 |      2 |      4 |
    +----+--------+--------+

    >>> df_deep = df.copy(deep=True)
    >>> df_deep['col1'] +=1
    >>> display(df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      3 |
    +----+--------+--------+
    |  1 |      2 |      4 |
    +----+--------+--------+

    >>> df_shallow = df.copy(deep=False)
    >>> df_shallow['col1'] +=1
    >>> display(df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      2 |      3 |
    +----+--------+--------+
    |  1 |      3 |      4 |
    +----+--------+--------+
    """
    # Only the literal True requests a deep copy; everything else is shallow.
    if deep is not True:
        return DataFrame(self)

    clone = DataFrame()
    clone._size = self._nrows
    clone._bytes = self._bytes
    clone._empty = self._empty

    # Slice the column list: without a slice, dropping columns modifies both.
    clone._columns = self._columns[:]

    for name, column in self.items():
        clone[name] = column[:]

    # Slice the index too: without a slice, renaming indexes would update both.
    clone._set_index(Index(self.index.index[:]))

    return clone
def groupby(self, keys, use_series=True, as_index=True, dropna=True):
    """
    Group the dataframe by a column or a list of columns.

    Alias for ``GroupBy``; every argument is forwarded unchanged.

    Parameters
    ----------
    keys : str or list of str
        An (ordered) list of column names or a single string to group by.
    use_series : bool, default=True
        If True, returns an arkouda.dataframe.DataFrameGroupBy object.
        Otherwise an arkouda.groupbyclass.GroupBy object.
    as_index : bool, default=True
        If True, groupby columns will be set as index, otherwise the
        groupby columns will be treated as DataFrame columns.
    dropna : bool, default=True
        If True, and the groupby keys contain NaN values, the NaN values
        together with the corresponding row will be dropped. Otherwise,
        the rows corresponding to NaN values will be kept.

    Returns
    -------
    arkouda.dataframe.DataFrameGroupBy or arkouda.groupbyclass.GroupBy
        If use_series = True, returns an arkouda.dataframe.DataFrameGroupBy
        object. Otherwise returns an arkouda.groupbyclass.GroupBy object.

    See Also
    --------
    arkouda.GroupBy

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col1': [1.0, 1.0, 2.0, np.nan], 'col2': [4, 5, 6, 7]})
    >>> df

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |      4 |
    +----+--------+--------+
    |  1 |      1 |      5 |
    +----+--------+--------+
    |  2 |      2 |      6 |
    +----+--------+--------+
    |  3 |    nan |      7 |
    +----+--------+--------+

    >>> df.GroupBy("col1")
    <arkouda.groupbyclass.GroupBy at 0x7f2cf23e10c0>
    >>> df.GroupBy("col1").size()
    (array([1.00000000000000000 2.00000000000000000]), array([2 1]))

    >>> df.GroupBy("col1", use_series=True).size()
    col1
    1.0    2
    2.0    1
    dtype: int64
    >>> df.GroupBy("col1", use_series=True, as_index=False).size()

    +----+--------+--------+
    |    |   col1 |   size |
    +====+========+========+
    |  0 |      1 |      2 |
    +----+--------+--------+
    |  1 |      2 |      1 |
    +----+--------+--------+
    """
    grouped = self.GroupBy(keys, use_series, as_index=as_index, dropna=dropna)
    return grouped
@typechecked
def isin(self, values: Union[pdarray, Dict, Series, DataFrame]) -> DataFrame:
    """
    Determine whether each element in the DataFrame is contained in values.

    Parameters
    ----------
    values : pdarray, dict, Series, or DataFrame
        The values to check for in DataFrame. Series can only have a
        single index.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Arkouda DataFrame of booleans showing whether each element in the
        DataFrame is contained in values.

    Raises
    ------
    ValueError
        Raised if ``values`` is a Series whose index is not an ``Index``.

    See Also
    --------
    ak.Series.isin

    Notes
    -----
    - Pandas supports values being an iterable type. In arkouda, we
      replace this with pdarray.
    - Pandas supports ~ operations. Currently, ak.DataFrame does not
      support this.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({'col_A': ak.array([7, 3]), 'col_B':ak.array([1, 9])})
    >>> display(df)

    +----+---------+---------+
    |    |   col_A |   col_B |
    +====+=========+=========+
    |  0 |       7 |       1 |
    +----+---------+---------+
    |  1 |       3 |       9 |
    +----+---------+---------+

    When `values` is a pdarray, check every value in the DataFrame to
    determine if it exists in values.

    >>> df.isin(ak.array([0, 1]))

    +----+---------+---------+
    |    |   col_A |   col_B |
    +====+=========+=========+
    |  0 |       0 |       1 |
    +----+---------+---------+
    |  1 |       0 |       0 |
    +----+---------+---------+

    When `values` is a dict, the values in the dict are passed to check
    the column indicated by the key.

    >>> df.isin({'col_A': ak.array([0, 3])})

    +----+---------+---------+
    |    |   col_A |   col_B |
    +====+=========+=========+
    |  0 |       0 |       0 |
    +----+---------+---------+
    |  1 |       1 |       0 |
    +----+---------+---------+

    When `values` is a Series, each column is checked if values is
    present positionally. This means that for `True` to be returned, the
    indexes must be the same.

    >>> i = ak.Index(ak.arange(2))
    >>> s = ak.Series(data=[3, 9], index=i)
    >>> df.isin(s)

    +----+---------+---------+
    |    |   col_A |   col_B |
    +====+=========+=========+
    |  0 |       0 |       0 |
    +----+---------+---------+
    |  1 |       0 |       1 |
    +----+---------+---------+

    When `values` is a DataFrame, the index and column must match. Note
    that 9 is not found because the column name does not match.

    >>> other_df = ak.DataFrame({'col_A':ak.array([7, 3]), 'col_C':ak.array([0, 9])})
    >>> df.isin(other_df)

    +----+---------+---------+
    |    |   col_A |   col_B |
    +====+=========+=========+
    |  0 |       1 |       0 |
    +----+---------+---------+
    |  1 |       1 |       0 |
    +----+---------+---------+
    """
    col_names = self.columns.values

    if isinstance(values, pdarray):
        # Flatten the DataFrame so a single in1d call covers every column,
        # then cut the flat result back into per-column pieces.
        flat_hits = in1d(concatenate(list(self.data.values())), values)
        col_sizes = [self.data[col].size for col in col_names]
        bounds = concatenate([array([0]), cumsum(array(col_sizes))])
        df_def = {
            col: flat_hits[bounds[i] : bounds[i + 1]] for i, col in enumerate(col_names)
        }
    elif isinstance(values, Dict):
        # Key is the column name, value is the set of values to look for.
        # Columns that are not mentioned are all False.
        df_def = {}
        for col in col_names:
            if col in values:
                df_def[col] = in1d(self.data[col], values[col])
            else:
                df_def[col] = zeros(self._nrows, dtype=akbool)
    elif isinstance(values, DataFrame) or (
        isinstance(values, Series) and isinstance(values.index, Index)
    ):
        # Start with everything False.
        df_def = {col: zeros(self._nrows, dtype=akbool) for col in col_names}
        # Identify the index entries present on both sides.
        rows_self, rows_val = intersect(self.index.index, values.index.index, unique=True)

        # Orderings used to line up the shared rows of both operands.
        sort_self = self.index[rows_self].argsort()
        sort_val = values.index[rows_val].argsort()

        other_is_frame = isinstance(values, DataFrame)
        other_is_series = isinstance(values, Series)
        # Only update columns that exist in both, and only the rows whose
        # indexes match.
        for col in col_names:
            if other_is_frame and col in values.columns:
                df_def[col][rows_self] = (
                    self.data[col][rows_self][sort_self] == values.data[col][rows_val][sort_val]
                )
            elif other_is_series:
                df_def[col][rows_self] = (
                    self.data[col][rows_self][sort_self] == values.values[rows_val][sort_val]
                )
    else:
        # pandas provides the same error in this case
        raise ValueError("Cannot compute isin with duplicate axis.")

    return DataFrame(df_def, index=self.index)
def count(self, axis: Union[int, str] = 0, numeric_only=False) -> Series:
    """
    Count non-NA cells for each column or row.

    The values np.NaN are considered NA.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index' counts are generated for each column.
        If 1 or 'columns' counts are generated for each row.
    numeric_only : bool, default False
        Include only float, int or boolean data.

    Returns
    -------
    arkouda.series.Series
        For each column/row the number of non-NA/null entries.

    Raises
    ------
    ValueError
        Raised if axis is not 0, 1, 'index', or 'columns'.

    See Also
    --------
    GroupBy.count

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import numpy as np
    >>> df = ak.DataFrame({'col_A': ak.array([7, np.nan]), 'col_B':ak.array([1, 9])})
    >>> display(df)

    +----+---------+---------+
    |    |   col_A |   col_B |
    +====+=========+=========+
    |  0 |       7 |       1 |
    +----+---------+---------+
    |  1 |     nan |       9 |
    +----+---------+---------+

    >>> df.count()
    col_A    1
    col_B    2
    dtype: int64

    >>> df = ak.DataFrame({'col_A': ak.array(["a","b","c"]), 'col_B':ak.array([1, np.nan, np.nan])})
    >>> display(df)

    +----+---------+---------+
    |    | col_A   |   col_B |
    +====+=========+=========+
    |  0 | a       |       1 |
    +----+---------+---------+
    |  1 | b       |     nan |
    +----+---------+---------+
    |  2 | c       |     nan |
    +----+---------+---------+

    >>> df.count()
    col_A    3
    col_B    1
    dtype: int64

    >>> df.count(numeric_only=True)
    col_B    1
    dtype: int64

    >>> df.count(axis=1)
    0    2
    1    1
    2    1
    dtype: int64
    """
    from arkouda import full, isnan
    from arkouda.util import is_numeric

    if (isinstance(axis, int) and axis == 0) or (isinstance(axis, str) and axis == "index"):
        labels = []
        totals = []
        for col in self.columns:
            column = self[col]
            if is_numeric(column):
                labels.append(col)
                totals.append((~isnan(column)).sum())
            elif not numeric_only or column.dtype == bool:
                labels.append(col)
                # Non-numeric columns do not have NaN values.
                totals.append(column.size)
        return Series(array(totals), index=Index(array(labels)))

    elif (isinstance(axis, int) and axis == 1) or (isinstance(axis, str) and axis == "columns"):
        started = False
        row_counts = arange(0)
        for col in self.columns:
            column = self[col]
            if is_numeric(column):
                if started:
                    row_counts += ~isnan(column)
                else:
                    # The first contributing column seeds the accumulator.
                    row_counts = akcast(~isnan(column), dt="int64")
                    started = True
            elif not numeric_only or column.dtype == bool:
                if started:
                    row_counts += 1
                else:
                    row_counts = full(self.index.size, 1, dtype=akint64)
                    started = True
        if not started:
            # No column contributed, so every row counts zero.
            row_counts = full(self.index.size, 0, dtype=akint64)

        if self.index is None:
            return Series(array(row_counts))
        idx = self.index[:]
        return Series(array(row_counts), index=idx)

    else:
        raise ValueError(f"No axis named {axis} for object type DataFrame")
def corr(self) -> DataFrame:
    """
    Return new DataFrame with pairwise correlation of columns.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Arkouda DataFrame containing correlation matrix of all columns.

    Raises
    ------
    RuntimeError
        Raised if there's a server-side error thrown.

    See Also
    --------
    pdarray.corr

    Notes
    -----
    Generates the correlation matrix using Pearson R for all columns.

    Attempts to convert to numeric values where possible for inclusion in
    the matrix: Strings columns are turned into a Categorical, and the
    codes of non-pdarray columns are used.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2], 'col2': [-1, -2]})
    >>> display(df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      1 |     -1 |
    +----+--------+--------+
    |  1 |      2 |     -2 |
    +----+--------+--------+

    >>> corr = df.corr()
    >>> display(corr)

    +------+--------+--------+
    |      |   col1 |   col2 |
    +======+========+========+
    | col1 |      1 |     -1 |
    +------+--------+--------+
    | col2 |     -1 |      1 |
    +------+--------+--------+
    """

    def as_numeric(column):
        # Strings become a Categorical; anything that is not a pdarray is
        # represented by its codes.
        if isinstance(column, Strings):
            column = Categorical(column)
        if isinstance(column, pdarray):
            return column
        return column.codes

    names = self.columns.values
    matrix = {}
    for c1 in names:
        coeffs = np.zeros(len(names))
        matrix[c1] = coeffs
        for pos, c2 in enumerate(names):
            if c1 == c2:
                # A column is perfectly correlated with itself.
                coeffs[pos] = 1
            else:
                coeffs[pos] = as_numeric(self[c1]).corr(as_numeric(self[c2]))

    return DataFrame({c: array(v) for c, v in matrix.items()}, index=array(names))
@typechecked
def merge(
    self,
    right: DataFrame,
    on: Optional[Union[str, List[str]]] = None,
    how: str = "inner",
    left_suffix: str = "_x",
    right_suffix: str = "_y",
    convert_ints: bool = True,
    sort: bool = True,
) -> DataFrame:
    r"""
    Merge Arkouda DataFrames with a database-style join.

    This DataFrame is the left operand. The resulting dataframe contains
    rows from both DataFrames as specified by the merge condition (based
    on the "how" and "on" parameters). All arguments are forwarded to the
    module-level ``merge`` function.

    Based on pandas merge functionality.
    https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

    Parameters
    ----------
    right : DataFrame
        The Right DataFrame to be joined.
    on : Optional[Union[str, List[str]]] = None
        The name or list of names of the DataFrame column(s) to join on.
        If on is None, this defaults to the intersection of the columns
        in both DataFrames.
    how : str, default = "inner"
        The merge condition. The examples below use "inner", "left",
        "right" and "outer".
    left_suffix : str, default = "_x"
        A string indicating the suffix to add to columns from the left
        dataframe for overlapping column names in both left and right.
    right_suffix : str, default = "_y"
        A string indicating the suffix to add to columns from the right
        dataframe for overlapping column names in both left and right.
    convert_ints : bool = True
        If True, convert columns with missing int values (due to the
        join) to float64. This is to match pandas. If False, do not
        convert the column dtypes. This has no effect when how = "inner".
    sort : bool = True
        If True, DataFrame is returned sorted by "on".
        Otherwise, the DataFrame is not sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Joined Arkouda DataFrame.

    Note
    ----
    Multiple column joins are only supported for integer columns.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> left_df = ak.DataFrame({'col1': ak.arange(5), 'col2': -1 * ak.arange(5)})
    >>> display(left_df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      0 |      0 |
    +----+--------+--------+
    |  1 |      1 |     -1 |
    +----+--------+--------+
    |  2 |      2 |     -2 |
    +----+--------+--------+
    |  3 |      3 |     -3 |
    +----+--------+--------+
    |  4 |      4 |     -4 |
    +----+--------+--------+

    >>> right_df = ak.DataFrame({'col1': 2 * ak.arange(5), 'col2': 2 * ak.arange(5)})
    >>> display(right_df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      0 |      0 |
    +----+--------+--------+
    |  1 |      2 |      2 |
    +----+--------+--------+
    |  2 |      4 |      4 |
    +----+--------+--------+
    |  3 |      6 |      6 |
    +----+--------+--------+
    |  4 |      8 |      8 |
    +----+--------+--------+

    >>> left_df.merge(right_df, on = "col1")

    +----+--------+----------+----------+
    |    |   col1 |   col2_x |   col2_y |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      2 |       -2 |        2 |
    +----+--------+----------+----------+
    |  2 |      4 |       -4 |        4 |
    +----+--------+----------+----------+

    >>> left_df.merge(right_df, on = "col1", how = "left")

    +----+--------+----------+----------+
    |    |   col1 |   col2_y |   col2_x |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      1 |      nan |       -1 |
    +----+--------+----------+----------+
    |  2 |      2 |        2 |       -2 |
    +----+--------+----------+----------+
    |  3 |      3 |      nan |       -3 |
    +----+--------+----------+----------+
    |  4 |      4 |        4 |       -4 |
    +----+--------+----------+----------+

    >>> left_df.merge(right_df, on = "col1", how = "right")

    +----+--------+----------+----------+
    |    |   col1 |   col2_x |   col2_y |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      2 |       -2 |        2 |
    +----+--------+----------+----------+
    |  2 |      4 |       -4 |        4 |
    +----+--------+----------+----------+
    |  3 |      6 |      nan |        6 |
    +----+--------+----------+----------+
    |  4 |      8 |      nan |        8 |
    +----+--------+----------+----------+

    >>> left_df.merge(right_df, on = "col1", how = "outer")

    +----+--------+----------+----------+
    |    |   col1 |   col2_y |   col2_x |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      1 |      nan |       -1 |
    +----+--------+----------+----------+
    |  2 |      2 |        2 |       -2 |
    +----+--------+----------+----------+
    |  3 |      3 |      nan |       -3 |
    +----+--------+----------+----------+
    |  4 |      4 |        4 |       -4 |
    +----+--------+----------+----------+
    |  5 |      6 |        6 |      nan |
    +----+--------+----------+----------+
    |  6 |      8 |        8 |      nan |
    +----+--------+----------+----------+
    """
    # Inside a method body the bare name ``merge`` is the module-level
    # function, not this method.
    joined = merge(
        self,
        right,
        on,
        how,
        left_suffix,
        right_suffix,
        convert_ints=convert_ints,
        sort=sort,
    )
    return joined
@typechecked
def isna(self) -> DataFrame:
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    numpy.NaN values get mapped to True values. Everything else gets
    mapped to False values; non-numeric columns are all False.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Mask of bool values for each element in DataFrame that indicates
        whether an element is an NA value. The result is built from the
        column data only; this DataFrame's index is not passed along.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import numpy as np
    >>> df = ak.DataFrame({"A": [np.nan, 2, 2, 3], "B": [3, np.nan, 5, 6],
    ...          "C": [1, np.nan, 2, np.nan], "D":["a","b","c","d"]})
    >>> display(df)

    +----+-----+-----+-----+-----+
    |    |   A |   B |   C | D   |
    +====+=====+=====+=====+=====+
    |  0 | nan |   3 |   1 | a   |
    +----+-----+-----+-----+-----+
    |  1 |   2 | nan | nan | b   |
    +----+-----+-----+-----+-----+
    |  2 |   2 |   5 |   2 | c   |
    +----+-----+-----+-----+-----+
    |  3 |   3 |   6 | nan | d   |
    +----+-----+-----+-----+-----+

    >>> df.isna()
           A      B      C      D
    0   True  False  False  False
    1  False   True   True  False
    2  False  False  False  False
    3  False  False   True  False (4 rows x 4 columns)
    """
    from arkouda import full, isnan
    from arkouda.util import is_numeric

    def na_mask(col: str):
        # Only numeric columns can hold NaN; everything else is all False.
        if not is_numeric(self[col]):
            return full(self.shape[0], False, dtype=akbool)
        return isnan(self[col])

    masks = {}
    for name in self.columns.values:
        masks[name] = na_mask(name)
    return DataFrame(masks)
@typechecked
def notna(self) -> DataFrame:
    """
    Detect existing (non-missing) values.

    Return a boolean same-sized object indicating if the values are not
    NA. numpy.NaN values get mapped to False values; non-numeric columns
    are all True.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Mask of bool values for each element in DataFrame that indicates
        whether an element is not an NA value. The result is built from
        the column data only; this DataFrame's index is not passed along.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import numpy as np
    >>> df = ak.DataFrame({"A": [np.nan, 2, 2, 3], "B": [3, np.nan, 5, 6],
    ...          "C": [1, np.nan, 2, np.nan], "D":["a","b","c","d"]})
    >>> display(df)

    +----+-----+-----+-----+-----+
    |    |   A |   B |   C | D   |
    +====+=====+=====+=====+=====+
    |  0 | nan |   3 |   1 | a   |
    +----+-----+-----+-----+-----+
    |  1 |   2 | nan | nan | b   |
    +----+-----+-----+-----+-----+
    |  2 |   2 |   5 |   2 | c   |
    +----+-----+-----+-----+-----+
    |  3 |   3 |   6 | nan | d   |
    +----+-----+-----+-----+-----+

    >>> df.notna()
           A      B      C     D
    0  False   True   True  True
    1   True  False  False  True
    2   True   True   True  True
    3   True   True  False  True (4 rows x 4 columns)
    """
    from arkouda import full, isnan
    from arkouda.util import is_numeric

    def present_mask(col: str):
        # Only numeric columns can hold NaN; everything else is all True.
        if not is_numeric(self[col]):
            return full(self.shape[0], True, dtype=akbool)
        return ~isnan(self[col])

    masks = {}
    for name in self.columns.values:
        masks[name] = present_mask(name)
    return DataFrame(masks)
@typechecked
def any(self, axis=0) -> Union[Series, bool]:
    """
    Return whether any element is True, potentially over an axis.

    Returns False unless there is at least one element along a Dataframe
    axis that is True.

    Currently, will ignore any columns that are not type bool.
    This is equivalent to the pandas option bool_only=True.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns', None}, default = 0
        Indicate which axis or axes should be reduced.

        0 / 'index' : reduce the index, return a Series whose index is
        the original column labels.

        1 / 'columns' : reduce the columns, return a Series whose index
        is the original index.

        None : reduce all axes, return a scalar.

    Returns
    -------
    arkouda.series.Series or bool

    Raises
    ------
    ValueError
        Raised if axis does not have a value in
        {0 or 'index', 1 or 'columns', None}.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({"A":[True,True,True,False],"B":[True,True,True,False],
    ...          "C":[True,False,True,False],"D":[False,False,False,False]})

    +----+---------+---------+---------+---------+
    |    | A       | B       | C       | D       |
    +====+=========+=========+=========+=========+
    |  0 | True    | True    | True    | False   |
    +----+---------+---------+---------+---------+
    |  1 | True    | True    | False   | False   |
    +----+---------+---------+---------+---------+
    |  2 | True    | True    | True    | False   |
    +----+---------+---------+---------+---------+
    |  3 | False   | False   | False   | False   |
    +----+---------+---------+---------+---------+

    >>> df.any(axis=0)
    A     True
    B     True
    C     True
    D    False
    dtype: bool
    >>> df.any(axis=1)
    0     True
    1     True
    2     True
    3    False
    dtype: bool
    >>> df.any(axis=None)
    True
    """
    from arkouda import any as akany
    from arkouda import array, full

    if self.empty:
        if axis is None:
            return False
        else:
            return Series(array([], dtype=bool))

    # Only boolean columns take part in the reduction.
    bool_cols = [col for col in self.columns.values if self.dtypes[col] == "bool"]

    if (isinstance(axis, int) and axis == 0) or (isinstance(axis, str) and axis == "index"):
        return Series(
            array([akany(self[col]) for col in bool_cols]),
            index=Index(bool_cols),
        )
    elif (isinstance(axis, int) and axis == 1) or (isinstance(axis, str) and axis == "columns"):
        mask = None
        for col in bool_cols:
            # Combine with `|`, never `|=`: the first operand is the
            # DataFrame's own column, and an in-place OR would overwrite
            # the caller's data with the running result.
            mask = self[col] if mask is None else mask | self[col]
        if mask is None:
            # No boolean columns at all: nothing can be True.
            mask = full(self.shape[0], False, dtype=bool)
        return Series(mask, index=self.index.values[:])
    elif axis is None:
        # Scalar reduction; stop at the first column that contains a True.
        for col in bool_cols:
            if akany(self[col]):
                return True
        return False
    else:
        raise ValueError("axis must have value 0, 1, 'index', 'columns', or None.")
@typechecked
def all(self, axis=0) -> Union[Series, bool]:
    """
    Return whether all elements are True, potentially over an axis.

    Returns True unless there is at least one element along a Dataframe
    axis that is False.

    Currently, will ignore any columns that are not type bool.
    This is equivalent to the pandas option bool_only=True.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns', None}, default = 0
        Indicate which axis or axes should be reduced.

        0 / 'index' : reduce the index, return a Series whose index is
        the original column labels.

        1 / 'columns' : reduce the columns, return a Series whose index
        is the original index.

        None : reduce all axes, return a scalar.

    Returns
    -------
    arkouda.series.Series or bool

    Raises
    ------
    ValueError
        Raised if axis does not have a value in
        {0 or 'index', 1 or 'columns', None}.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> df = ak.DataFrame({"A":[True,True,True,False],"B":[True,True,True,False],
    ...          "C":[True,False,True,False],"D":[True,True,True,True]})

    +----+---------+---------+---------+--------+
    |    | A       | B       | C       | D      |
    +====+=========+=========+=========+========+
    |  0 | True    | True    | True    | True   |
    +----+---------+---------+---------+--------+
    |  1 | True    | True    | False   | True   |
    +----+---------+---------+---------+--------+
    |  2 | True    | True    | True    | True   |
    +----+---------+---------+---------+--------+
    |  3 | False   | False   | False   | True   |
    +----+---------+---------+---------+--------+

    >>> df.all(axis=0)
    A    False
    B    False
    C    False
    D     True
    dtype: bool
    >>> df.all(axis=1)
    0     True
    1    False
    2     True
    3    False
    dtype: bool
    >>> df.all(axis=None)
    False
    """
    from arkouda import all as akall
    from arkouda import array, full

    if self.empty:
        if axis is None:
            return True
        else:
            return Series(array([], dtype=bool))

    # Only boolean columns take part in the reduction.
    bool_cols = [col for col in self.columns.values if self.dtypes[col] == "bool"]

    if (isinstance(axis, int) and axis == 0) or (isinstance(axis, str) and axis == "index"):
        return Series(
            array([akall(self[col]) for col in bool_cols]),
            index=Index(bool_cols),
        )
    elif (isinstance(axis, int) and axis == 1) or (isinstance(axis, str) and axis == "columns"):
        mask = None
        for col in bool_cols:
            # Combine with `&`, never `&=`: the first operand is the
            # DataFrame's own column, and an in-place AND would overwrite
            # the caller's data with the running result.
            mask = self[col] if mask is None else mask & self[col]
        if mask is None:
            # No boolean columns at all: nothing can be False.
            mask = full(self.shape[0], True, dtype=bool)
        return Series(mask, index=self.index.values[:])
    elif axis is None:
        # Scalar reduction; stop at the first column that contains a False.
        for col in bool_cols:
            if not akall(self[col]):
                return False
        return True
    else:
        raise ValueError("axis must have value 0, 1, 'index', 'columns', or None.")
@typechecked
def dropna(
    self,
    axis: Union[int, str] = 0,
    how: Optional[str] = None,
    thresh: Optional[int] = None,
    ignore_index: bool = False,
) -> DataFrame:
    """
    Remove missing values.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default = 0
        Determine if rows or columns which contain missing values are
        removed.

        0, or 'index': Drop rows which contain missing values.

        1, or 'columns': Drop columns which contain missing value.

        Only a single axis is allowed.
    how : {'any', 'all'}, default='any'
        Determine if row or column is removed from DataFrame, when we
        have at least one NA or all NA.

        'any': If any NA values are present, drop that row or column.

        'all': If all values are NA, drop that row or column.
    thresh : int, optional
        Require that many non-NA values. Cannot be combined with how.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

    Returns
    -------
    arkouda.dataframe.DataFrame
        DataFrame with NA entries dropped from it.

    Raises
    ------
    TypeError
        Raised if both ``how`` and ``thresh`` are given.
    ValueError
        Raised if ``axis`` is not 0, 1, 'index' or 'columns', or if
        ``how`` is not 'any' or 'all'.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> import numpy as np
    >>> df = ak.DataFrame(
    ...     {
    ...         "A": [True, True, True, True],
    ...         "B": [1, np.nan, 2, np.nan],
    ...         "C": [1, 2, 3, np.nan],
    ...         "D": [False, False, False, False],
    ...         "E": [1, 2, 3, 4],
    ...         "F": ["a", "b", "c", "d"],
    ...         "G": [1, 2, 3, 4],
    ...     }
    ... )
    >>> display(df)

    +----+------+-----+-----+-------+-----+-----+-----+
    |    | A    |   B |   C | D     |   E | F   |   G |
    +====+======+=====+=====+=======+=====+=====+=====+
    |  0 | True |   1 |   1 | False |   1 | a   |   1 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  1 | True | nan |   2 | False |   2 | b   |   2 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  2 | True |   2 |   3 | False |   3 | c   |   3 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  3 | True | nan | nan | False |   4 | d   |   4 |
    +----+------+-----+-----+-------+-----+-----+-----+

    >>> df.dropna()

    +----+------+-----+-----+-------+-----+-----+-----+
    |    | A    |   B |   C | D     |   E | F   |   G |
    +====+======+=====+=====+=======+=====+=====+=====+
    |  0 | True |   1 |   1 | False |   1 | a   |   1 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  1 | True |   2 |   3 | False |   3 | c   |   3 |
    +----+------+-----+-----+-------+-----+-----+-----+

    >>> df.dropna(axis=1)

    +----+------+-------+-----+-----+-----+
    |    | A    | D     |   E | F   |   G |
    +====+======+=======+=====+=====+=====+
    |  0 | True | False |   1 | a   |   1 |
    +----+------+-------+-----+-----+-----+
    |  1 | True | False |   2 | b   |   2 |
    +----+------+-------+-----+-----+-----+
    |  2 | True | False |   3 | c   |   3 |
    +----+------+-------+-----+-----+-----+
    |  3 | True | False |   4 | d   |   4 |
    +----+------+-------+-----+-----+-----+

    >>> df.dropna(axis=1, thresh=3)

    +----+------+-----+-------+-----+-----+-----+
    |    | A    |   C | D     |   E | F   |   G |
    +====+======+=====+=======+=====+=====+=====+
    |  0 | True |   1 | False |   1 | a   |   1 |
    +----+------+-----+-------+-----+-----+-----+
    |  1 | True |   2 | False |   2 | b   |   2 |
    +----+------+-----+-------+-----+-----+-----+
    |  2 | True |   3 | False |   3 | c   |   3 |
    +----+------+-----+-------+-----+-----+-----+
    |  3 | True | nan | False |   4 | d   |   4 |
    +----+------+-----+-------+-----+-----+-----+

    >>> df.dropna(axis=1, how="all")

    +----+------+-----+-----+-------+-----+-----+-----+
    |    | A    |   B |   C | D     |   E | F   |   G |
    +====+======+=====+=====+=======+=====+=====+=====+
    |  0 | True |   1 |   1 | False |   1 | a   |   1 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  1 | True | nan |   2 | False |   2 | b   |   2 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  2 | True |   2 |   3 | False |   3 | c   |   3 |
    +----+------+-----+-----+-------+-----+-----+-----+
    |  3 | True | nan | nan | False |   4 | d   |   4 |
    +----+------+-----+-----+-------+-----+-----+-----+
    """
    from arkouda import all as akall

    if (how is not None) and (thresh is not None):
        raise TypeError("You cannot set both the how and thresh arguments at the same time.")

    if how is None:
        how = "any"

    drop_rows = (isinstance(axis, int) and axis == 0) or (
        isinstance(axis, str) and axis == "index"
    )
    drop_cols = (isinstance(axis, int) and axis == 1) or (
        isinstance(axis, str) and axis == "columns"
    )

    # Dropping along one axis means aggregating along the other one.
    if drop_rows:
        agg_axis = 1
    elif drop_cols:
        agg_axis = 0
    else:
        # Reject unknown axes up front instead of failing later on an
        # unbound ``agg_axis``.
        raise ValueError(f"No axis named {axis} for object type DataFrame")

    # ``mask`` is True for every row/column that is kept.
    if thresh is not None:
        counts = self.count(axis=agg_axis)
        mask = counts >= thresh  # type: ignore
    elif how == "any":
        mask = self.notna().all(axis=agg_axis)
    elif how == "all":
        mask = self.notna().any(axis=agg_axis)
    else:
        raise ValueError(f"invalid how option: {how}")

    if (isinstance(mask, bool) and mask is True) or (
        isinstance(mask, Series) and akall(mask.values) is True
    ):
        # Nothing to drop.
        result = self.copy(deep=None)
    else:
        if drop_rows:
            if self.empty is True:
                result = DataFrame()
            else:
                result = self[mask].copy(deep=True)
        else:
            # Column mode: rebuild the frame from the columns that are kept.
            result = DataFrame()
            if isinstance(mask, Series):
                for col, truth in zip(mask.index.values.to_list(), mask.values.to_list()):
                    if truth is True:
                        result[col] = self[col][:]

    if ignore_index is True and result.empty is False:
        result = result.reset_index()

    return result
@typechecked
def register(self, user_defined_name: str) -> DataFrame:
    """
    Register this DataFrame object and underlying components with the Arkouda server.

    Parameters
    ----------
    user_defined_name : str
        User defined name the DataFrame is to be registered under.
        This will be the root name for underlying components.

    Returns
    -------
    arkouda.dataframe.DataFrame
        The same DataFrame which is now registered with the arkouda
        server and has an updated name. This is an in-place modification,
        the original is returned to support a fluid programming style.
        Please note you cannot register two different DataFrames with the
        same name.

    Raises
    ------
    TypeError
        Raised if user_defined_name is not a str.
    RegistrationError
        If this object is already registered, or if the server was unable
        to register the DataFrame with the user_defined_name.

    See also
    --------
    unregister
    attach
    unregister_dataframe_by_name
    is_registered

    Notes
    -----
    Objects registered with the server are immune to deletion until
    they are unregistered.

    Any changes made to a DataFrame object after registering with the
    server may not be reflected in attached copies.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.register("my_table_name")
    >>> df.attach("my_table_name")
    >>> df.is_registered()
    True
    >>> df.unregister()
    >>> df.is_registered()
    False
    """
    from arkouda.categorical import Categorical as Categorical_

    if self.registered_name is not None and self.is_registered():
        raise RegistrationError(f"This object is already registered as {self.registered_name}")

    def describe(obj):
        # Compound column types are sent as a JSON document naming their
        # server-side components; every other column is sent by name.
        if isinstance(obj, Categorical_):
            components = {
                "codes": obj.codes.name,
                "categories": obj.categories.name,
                "NA_codes": obj._akNAcode.name,
            }
            if obj.permutation is not None:
                components["permutation"] = obj.permutation.name
            if obj.segments is not None:
                components["segments"] = obj.segments.name
            return json.dumps(components)
        if isinstance(obj, SegArray):
            return json.dumps({"segments": obj.segments.name, "values": obj.values.name})
        if isinstance(obj, BitVector):
            return json.dumps(
                {
                    "name": obj.name,
                    "width": obj.width,
                    "reverse": obj.reverse,
                }
            )
        return obj.name

    column_data = [describe(obj) for obj in self.values()]

    # Prefer the specialised type tag when a column has one.
    col_objTypes = []
    for obj in self.values():
        if hasattr(obj, "special_objType"):
            col_objTypes.append(obj.special_objType)
        else:
            col_objTypes.append(obj.objType)

    message_args = {
        "name": user_defined_name,
        "objType": self.objType,
        "idx": self.index.values.name,
        "num_cols": len(self.columns.values),
        "column_names": self.columns.values,
        "columns": column_data,
        "col_objTypes": col_objTypes,
    }
    generic_msg(cmd="register", args=message_args)

    self.registered_name = user_defined_name
    return self
def unregister(self):
    """
    Unregister this DataFrame object in the arkouda server.

    The object must previously have been registered using register()
    and/or attached to using attach().

    Raises
    ------
    RegistrationError
        If the object is already unregistered or if there is a server
        error when attempting to unregister.

    See also
    --------
    register
    attach
    unregister_dataframe_by_name
    is_registered

    Notes
    -----
    Objects registered with the server are immune to deletion until
    they are unregistered.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.register("my_table_name")
    >>> df.attach("my_table_name")
    >>> df.is_registered()
    True
    >>> df.unregister()
    >>> df.is_registered()
    False
    """
    from arkouda.util import unregister as _unregister_by_name

    current_name = self.registered_name
    if not current_name:
        raise RegistrationError("This object is not registered")

    _unregister_by_name(current_name)

    # Clear our internal DataFrame object name.
    self.registered_name = None
def is_registered(self) -> bool:
    """
    Return True if the object is contained in the registry.

    Returns
    -------
    bool
        Indicates if the object is contained in the registry. Always
        False when no registered name is stored on this object.

    Raises
    ------
    RegistrationError
        Raised if there's a server-side error or a mismatch of registered
        components.

    See Also
    --------
    register
    attach
    unregister
    unregister_dataframe_by_name

    Notes
    -----
    Objects registered with the server are immune to deletion until
    they are unregistered.

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.register("my_table_name")
    >>> df.attach("my_table_name")
    >>> df.is_registered()
    True
    >>> df.unregister()
    >>> df.is_registered()
    False
    """
    from arkouda.util import is_registered as _name_is_registered

    name = self.registered_name
    if name is not None:
        return _name_is_registered(name)
    # Dataframe cannot be registered as a component
    return False
@staticmethod
def attach(user_defined_name: str) -> DataFrame:
    """
    Return a DataFrame attached to a name registered with the arkouda server.

    Deprecated: emits a ``DeprecationWarning`` and delegates to
    ``arkouda.util.attach``. Use ``ak.attach()`` instead.

    Parameters
    ----------
    user_defined_name : str
        user defined name which DataFrame object was registered under.

    Returns
    -------
    arkouda.dataframe.DataFrame
        The DataFrame object created by re-attaching to the corresponding
        server components.

    Raises
    ------
    RegistrationError
        if user_defined_name is not registered

    See Also
    --------
    register, is_registered, unregister

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.register("my_table_name")
    >>> df.attach("my_table_name")
    >>> df.is_registered()
    True
    >>> df.unregister()
    >>> df.is_registered()
    False
    """
    import warnings

    from arkouda.util import attach as _attach_by_name

    deprecation_note = "ak.DataFrame.attach() is deprecated. Please use ak.attach() instead."
    warnings.warn(deprecation_note, DeprecationWarning)

    return _attach_by_name(user_defined_name)
@staticmethod
@typechecked
def unregister_dataframe_by_name(user_defined_name: str) -> str:
    """
    Unregister a DataFrame, by name, that was registered via register().

    Deprecated: emits a ``DeprecationWarning`` and delegates to
    ``arkouda.util.unregister``. Use ``ak.unregister()`` instead.

    Parameters
    ----------
    user_defined_name : str
        Name under which the DataFrame object was registered.

    Returns
    -------
    str
        Whatever ``arkouda.util.unregister`` returns for this name.

    Raises
    ------
    TypeError
        If user_defined_name is not a string.
    RegistrationError
        If there is an issue attempting to unregister any underlying
        components.

    See Also
    --------
    register
    unregister
    attach
    is_registered

    Example
    -------
    >>> df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
    >>> df.register("my_table_name")
    >>> df.attach("my_table_name")
    >>> df.is_registered()
    True
    >>> df.unregister_dataframe_by_name("my_table_name")
    >>> df.is_registered()
    False
    """
    import warnings

    from arkouda.util import unregister as _unregister_by_name

    deprecation_note = (
        "ak.DataFrame.unregister_dataframe_by_name() is deprecated. "
        "Please use ak.unregister() instead."
    )
    warnings.warn(deprecation_note, DeprecationWarning)

    return _unregister_by_name(user_defined_name)
@staticmethod def _parse_col_name(entryName, dfName): """ Helper method used by from_return_msg to parse the registered name of the data component and pull out the column type and column name Parameters ---------- entryName : string The full registered name of the data component dfName : string The name of the DataFrame Returns ------- tuple (columnName, columnType) """ nameParts = entryName.split(" ") regName = nameParts[1] if len(nameParts) > 1 else nameParts[0] colParts = regName.split("_") colType = colParts[2] # Case of '_' in the column or dataframe name if len(colParts) > 5: nameInd = regName.rindex(dfName) - 1 startInd = len(colType) + 9 return regName[startInd:nameInd], colType else: return colParts[3], colType
@classmethod
def from_return_msg(cls, rep_msg):
    """
    Build a DataFrame from an arkouda server response message.

    The message is parsed as a JSON object. Each value is split on ``"+|+"``
    into an object-type tag and a creation payload. The entry whose key is
    ``"index"`` (compared case-insensitively) becomes the index; every other
    entry becomes a column. Entries whose type tag matches none of the
    handled types are skipped.

    Parameters
    ----------
    rep_msg : string
        Server response message used to create a DataFrame.

    Returns
    -------
    arkouda.dataframe.DataFrame
    """
    from arkouda.categorical import Categorical as Categorical_

    index = None
    columns = {}
    for name, create_data in json.loads(rep_msg).items():
        parts = create_data.split("+|+")
        obj_type = parts[0]

        if name.lower() == "index":
            # The index is either a Strings object or a plain pdarray.
            if obj_type == Strings.objType.upper():
                index = Index(Strings.from_return_msg(parts[1]))
            else:
                index = Index(create_pdarray(parts[1]))
            continue

        # Column entry: pick the constructor that matches the type tag.
        if obj_type == pdarray.objType.upper():
            columns[name] = create_pdarray(parts[1])
        elif obj_type == Strings.objType.upper():
            columns[name] = Strings.from_return_msg(parts[1])
        elif obj_type == IPv4.special_objType.upper():
            columns[name] = IPv4(create_pdarray(parts[1]))
        elif obj_type == Datetime.special_objType.upper():
            columns[name] = Datetime(create_pdarray(parts[1]))
        elif obj_type == Timedelta.special_objType.upper():
            columns[name] = Timedelta(create_pdarray(parts[1]))
        elif obj_type == Categorical_.objType.upper():
            columns[name] = Categorical_.from_return_msg(parts[1])
        elif obj_type == SegArray.objType.upper():
            columns[name] = SegArray.from_return_msg(parts[1])
        elif obj_type == BitVector.special_objType.upper():
            columns[name] = BitVector.from_return_msg(parts[1])

    return cls(columns, index)
def assign(self, **kwargs) -> DataFrame:
    r"""
    Return a copy of the DataFrame with new or replaced columns.

    The result holds every original column plus the ones given as keywords.
    A keyword naming an existing column overwrites that column in the copy.

    Parameters
    ----------
    **kwargs : dict of {str: callable or Series}
        The column names are keywords. If the values are
        callable, they are computed on the DataFrame and
        assigned to the new columns. The callable must not
        change input DataFrame (though pandas doesn't check it).
        If the values are not callable, (e.g. a Series, scalar, or array),
        they are simply assigned.

    Returns
    -------
    DataFrame
        A new DataFrame with the new columns in addition to
        all the existing columns.

    Notes
    -----
    Assigning multiple columns within the same ``assign`` is possible.
    Later items in '\*\*kwargs' may refer to newly created or modified
    columns in 'df'; items are computed and assigned into 'df' in order.

    Examples
    --------
    >>> df = ak.DataFrame({'temp_c': [17.0, 25.0]},
    ...                   index=['Portland', 'Berkeley'])
    >>> df
              temp_c
    Portland    17.0
    Berkeley    25.0

    Where the value is a callable, evaluated on `df`:

    >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
              temp_c  temp_f
    Portland    17.0    62.6
    Berkeley    25.0    77.0

    Alternatively, the same behavior can be achieved by directly
    referencing an existing Series or sequence:

    >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
              temp_c  temp_f
    Portland    17.0    62.6
    Berkeley    25.0    77.0

    You can create multiple columns within the same assign where one
    of the columns depends on another one defined within the same assign:

    >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
    ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
              temp_c  temp_f  temp_k
    Portland    17.0    62.6  290.15
    Berkeley    25.0    77.0  298.15
    """
    result = self.copy(deep=None)
    for column_name, value in kwargs.items():
        # Each value is resolved against the frame as built so far, which is
        # what lets a later keyword refer to a column added by an earlier one.
        result[column_name] = apply_if_callable(value, result)
    return result
def intx(a, b):
    """
    Flag the rows of `a` that also appear in `b`.

    Both dataframes must have the same columns in the same order. Every
    column except one keyed ``"index"`` takes part in the comparison, which
    is delegated to ``in1d``.

    Note: does not work for columns of floating point values, but does work for
    Strings, pdarrays of int64 type, and Categorical *should* work.

    Raises
    ------
    ValueError
        If the column keys of `a` and `b` differ.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> a = ak.DataFrame({'a':ak.arange(5),'b': 2* ak.arange(5)})
    >>> display(a)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   1 |   2 |
    +----+-----+-----+
    |  2 |   2 |   4 |
    +----+-----+-----+
    |  3 |   3 |   6 |
    +----+-----+-----+
    |  4 |   4 |   8 |
    +----+-----+-----+

    >>> b = ak.DataFrame({'a':ak.arange(5),'b':ak.array([0,3,4,7,8])})
    >>> display(b)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   1 |   3 |
    +----+-----+-----+
    |  2 |   2 |   4 |
    +----+-----+-----+
    |  3 |   3 |   7 |
    +----+-----+-----+
    |  4 |   4 |   8 |
    +----+-----+-----+

    >>> intx(a,b)
    >>> intersect_df = a[intx(a,b)]
    >>> display(intersect_df)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   2 |   4 |
    +----+-----+-----+
    |  2 |   4 |   8 |
    +----+-----+-----+
    """
    # Refuse to compare frames whose column keys (or their order) differ.
    if list(a.data) != list(b.data):
        raise ValueError("Column mismatch.")

    a_cols = [column for name, column in a.items() if name != "index"]
    b_cols = [column for name, column in b.items() if name != "index"]
    return in1d(a_cols, b_cols)
def intersect(a, b, positions=True, unique=False):
    """
    Compute the intersection of two arkouda arrays.

    With `positions=True` the caller gets boolean masks marking where the
    shared values sit in each input, rather than the shared values themselves.

    Parameters
    ----------
    a : Strings or pdarray
        First array.
    b : Strings or pdarray
        Second array; must be the same kind of array as `a`.
    positions : bool, default=True
        Return tuple of boolean pdarrays that indicate positions in `a` and `b`
        of the intersection values.
    unique : bool, default=False
        Set to True when neither `a` nor `b` contains duplicate entries; a
        shorter computation is then used. Only consulted for Strings inputs.

    Returns
    -------
    (arkouda.pdarrayclass.pdarray, arkouda.pdarrayclass.pdarray) or
    arkouda.pdarrayclass.pdarray
        With `positions=True`, the masks for `a` and `b`. Otherwise the
        intersection values: ``intersect1d(a, b)`` for pdarrays, ``a[mask]``
        for Strings. If the inputs are neither both pdarray nor both Strings,
        the function returns None.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> a = ak.arange(10)
    >>> print(a)
    [0 1 2 3 4 5 6 7 8 9]

    >>> b = 2 * ak.arange(10)
    >>> print(b)
    [0 2 4 6 8 10 12 14 16 18]

    >>> intersect(a,b, positions=True)
    (array([True False True False True False True False True False]),
    array([True True True True True False False False False False]))

    >>> intersect(a,b, positions=False)
    array([0 2 4 6 8])
    """

    def _counts_in_input_order(grouping, group_counts):
        # Spread each group's count onto its members (groupby order), then
        # scatter through the grouping's permutation to restore input order.
        spread = grouping.broadcast(group_counts, permute=False)
        snapshot = spread[:]
        spread[grouping.permutation] = snapshot
        del snapshot
        return spread

    # pdarrays: set operations are available directly.
    if isinstance(a, pdarray) and isinstance(b, pdarray):
        common = intersect1d(a, b)
        if positions:
            return (in1d(a, common), in1d(b, common))
        return common

    # Any pairing other than Strings/Strings is not handled.
    if not (isinstance(a, Strings) and isinstance(b, Strings)):
        return None

    # Strings take more effort: work on the two-part hash of each entry.
    a_hash0, a_hash1 = a.hash()
    b_hash0, b_hash1 = b.hash()

    if unique:
        # No duplicates within a or b, so any hash seen more than once in the
        # concatenation must come from both arrays.
        stacked0 = concatenate([a_hash0, b_hash0])
        stacked1 = concatenate([a_hash1, b_hash1])
        by_hash = akGroupBy([stacked0, stacked1])
        _, occurrences = by_hash.count()
        counts = _counts_in_input_order(by_hash, occurrences)

        maska = (counts > 1)[: a.size]
        maskb = (counts > 1)[a.size :]
    else:
        # Duplicates are possible: reduce each side to its unique hashes first.
        by_a = akGroupBy([a_hash0, a_hash1])
        by_b = akGroupBy([b_hash0, b_hash1])
        a_key0, a_key1 = by_a.unique_keys
        b_key0, b_key1 = by_b.unique_keys

        stacked0 = concatenate([a_key0, b_key0])
        stacked1 = concatenate([a_key1, b_key1])
        by_hash = akGroupBy([stacked0, stacked1])
        _, occurrences = by_hash.count()
        counts = _counts_in_input_order(by_hash, occurrences)

        # Push the per-unique-key counts back up to every original entry.
        counts_for_a = counts[: a_key0.size]
        counts_for_b = counts[a_key0.size :]
        maska = _counts_in_input_order(by_a, counts_for_a) > 1
        maskb = _counts_in_input_order(by_b, counts_for_b) > 1

    if positions:
        return (maska, maskb)
    return a[maska]
def invert_permutation(perm):
    """
    Find the inverse of a permutation array.

    Parameters
    ----------
    perm : pdarray
        The permutation array.

    Returns
    -------
    arkouda.pdarrayclass.pdarray
        The inverse of the permutation array.

    Raises
    ------
    ValueError
        If `perm` contains repeated values, or if its values do not cover a
        contiguous range (``max - min + 1 != size``).

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> from arkouda.index import Index
    >>> i = Index(ak.array([1,2,0,5,4]))
    >>> perm = i.argsort()
    >>> print(perm)
    [2 0 1 4 3]
    >>> invert_permutation(perm)
    array([1 2 0 4 3])
    """
    # Test if the array is actually a permutation. Either failure on its own
    # disqualifies the input: a gap in the value range, or a repeated value.
    # The two conditions were previously joined with `and`, which let arrays
    # such as [0, 0, 2] (duplicates) or [0, 2, 5] (gaps) through unchecked.
    span = perm.max() - perm.min()
    # The range test is the cheaper of the two, so it goes first and lets
    # `or` short-circuit past the uniqueness computation.
    if (perm.size != span + 1) or (unique(perm).size != perm.size):
        raise ValueError("The array is not a permutation.")
    return coargsort([perm, arange(perm.size)])
@typechecked
def _inner_join_merge(
    left: DataFrame,
    right: DataFrame,
    on: Union[str, List[str]],
    col_intersect: Union[str, List[str]],
    left_suffix: str = "_x",
    right_suffix: str = "_y",
    sort: bool = True,
) -> DataFrame:
    """
    Utilizes the ak.join.inner_join function to return an ak
    DataFrame object containing only rows that are in both the left and right
    Dataframes, (based on the "on" param), as well as their associated values.

    Parameters
    ----------
    left: DataFrame
        The Left DataFrame to be joined
    right: DataFrame
        The Right DataFrame to be joined
    on: Union[str, List[str]]
        The name or list of names of the DataFrame column(s) to join on.
        Required here; defaulting is handled by the caller.
    col_intersect: Union[str, List[str]]
        Names used to decide which non-key columns are renamed: a non-key
        column whose name is in col_intersect gets the suffix of the
        DataFrame it came from.
    left_suffix: str = "_x"
        A string indicating the suffix to add to columns from the left dataframe for overlapping
        column names in both left and right. Defaults to "_x"
    right_suffix: str = "_y"
        A string indicating the suffix to add to columns from the right dataframe for overlapping
        column names in both left and right. Defaults to "_y"
    sort: bool = True
        If True, DataFrame is returned sorted by "on".
        Otherwise, the DataFrame is not sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Inner-Joined Arkouda DataFrame
    """
    # Work on copies of the column-name lists so the key columns can be
    # removed without touching the input DataFrames.
    left_cols, right_cols = left.columns.values.copy(), right.columns.values.copy()
    if isinstance(on, str):
        left_inds, right_inds = inner_join(left[on], right[on])
        # Key column values are taken from the left DataFrame.
        new_dict = {on: left[on][left_inds]}
        left_cols.remove(on)
        right_cols.remove(on)
    else:
        left_inds, right_inds = inner_join([left[col] for col in on], [right[col] for col in on])
        new_dict = {col: left[col][left_inds] for col in on}
        for col in on:
            left_cols.remove(col)
            right_cols.remove(col)

    # Remaining (non-key) columns: gather the matched rows, adding a suffix
    # where the name is listed in col_intersect.
    for col in left_cols:
        new_col = col + left_suffix if col in col_intersect else col
        new_dict[new_col] = left[col][left_inds]
    for col in right_cols:
        new_col = col + right_suffix if col in col_intersect else col
        new_dict[new_col] = right[col][right_inds]

    ret_df = DataFrame(new_dict)
    if sort is True:
        ret_df = ret_df.sort_values(on).reset_index()
    return ret_df


def _right_join_merge(
    left: DataFrame,
    right: DataFrame,
    on: Union[str, List[str]],
    col_intersect: Union[str, List[str]],
    left_suffix: str = "_x",
    right_suffix: str = "_y",
    convert_ints: bool = True,
    sort: bool = True,
) -> DataFrame:
    """
    Builds on _inner_join_merge to return an ak
    DataFrame object containing all the rows in the right Dataframe, as well as
    corresponding rows in the left (based on the "on" param),
    and all of their associated values.
    Based on pandas merge functionality.

    Parameters
    ----------
    left: DataFrame
        The Left DataFrame to be joined
    right: DataFrame
        The Right DataFrame to be joined
    on: Union[str, List[str]]
        The name or list of names of the DataFrame column(s) to join on.
        Required here; defaulting is handled by the caller.
    col_intersect: Union[str, List[str]]
        Passed through unchanged to _inner_join_merge, which uses it to
        decide which non-key columns receive a suffix.
    left_suffix: str = "_x"
        A string indicating the suffix to add to columns from the left dataframe for overlapping
        column names in both left and right. Defaults to "_x"
    right_suffix: str = "_y"
        A string indicating the suffix to add to columns from the right dataframe for overlapping
        column names in both left and right. Defaults to "_y"
    convert_ints: bool = True
        If True, convert columns with missing int values (due to the join) to float64.
        This is to match pandas.
        If False, do not convert the column dtypes.
    sort: bool = True
        If True, DataFrame is returned sorted by "on".
        Otherwise, the DataFrame is not sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Right-Joined Arkouda DataFrame
    """
    # Rows with a match on both sides; sorting is deferred to the end.
    in_left = _inner_join_merge(left, right, on, col_intersect, left_suffix, right_suffix, sort=False)
    # NOTE(review): in_left_cols is pruned below but never read afterwards.
    in_left_cols, left_cols = in_left.columns.values.copy(), left.columns.values.copy()
    if isinstance(on, str):
        left_at_on = left[on]
        right_at_on = right[on]
        left_cols.remove(on)
        in_left_cols.remove(on)
    else:
        left_at_on = [left[col] for col in on]
        right_at_on = [right[col] for col in on]
        for col in on:
            left_cols.remove(col)
            in_left_cols.remove(col)

    # Rows of `right` whose key does not occur in `left`.
    not_in_left = right[in1d(right_at_on, left_at_on, invert=True)]
    # Rename non-key columns that also exist in `left` so they line up with
    # the suffixed names produced by the inner join.
    for col in not_in_left.columns:
        if col in left_cols:
            not_in_left[col + right_suffix] = not_in_left[col]
            not_in_left = not_in_left.drop(col, axis=1)

    # Columns present in the inner-join result but absent from the unmatched
    # rows; these have to be filled with nulls.
    nan_cols = list(set(in_left) - set(not_in_left))
    for col in nan_cols:
        if convert_ints is True and in_left[col].dtype == int:
            in_left[col] = akcast(in_left[col], akfloat64)

        # Create a nan array for all values not in the left df
        not_in_left[col] = __nulls_like(in_left[col], len(not_in_left))
    ret_df = DataFrame.append(in_left, not_in_left)
    if sort is True:
        ret_df = ret_df.sort_values(on).reset_index()
    return ret_df


def _outer_join_merge(
    left: DataFrame,
    right: DataFrame,
    on: Union[str, List[str]],
    col_intersect: Union[str, List[str]],
    left_suffix: str = "_x",
    right_suffix: str = "_y",
    convert_ints: bool = True,
    sort: bool = True,
) -> DataFrame:
    """
    Builds on _inner_join_merge to return an ak
    DataFrame object containing all the rows in each DataFrame
    (based on the "on" param), and all of their associated values.
    Based on pandas merge functionality.

    Parameters
    ----------
    left: DataFrame
        The Left DataFrame to be joined
    right: DataFrame
        The Right DataFrame to be joined
    on: Union[str, List[str]]
        The name or list of names of the DataFrame column(s) to join on.
        Required here; defaulting is handled by the caller.
    col_intersect: Union[str, List[str]]
        Passed through unchanged to _inner_join_merge, which uses it to
        decide which non-key columns receive a suffix.
    left_suffix: str = "_x"
        A string indicating the suffix to add to columns from the left dataframe for overlapping
        column names in both left and right. Defaults to "_x"
    right_suffix: str = "_y"
        A string indicating the suffix to add to columns from the right dataframe for overlapping
        column names in both left and right. Defaults to "_y"
    convert_ints: bool = True
        If True, convert columns with missing int values (due to the join) to float64.
        This is to match pandas.
        If False, do not convert the column dtypes.
    sort: bool = True
        If True, DataFrame is returned sorted by "on".
        Otherwise, the DataFrame is not sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Outer-Joined Arkouda DataFrame
    """
    # Rows with a match on both sides; sorting is deferred to the end.
    inner = _inner_join_merge(left, right, on, col_intersect, left_suffix, right_suffix, sort=False)
    left_cols, right_cols = (
        left.columns.values.copy(),
        right.columns.values.copy(),
    )
    if isinstance(on, str):
        left_at_on = left[on]
        right_at_on = right[on]
        left_cols.remove(on)
        right_cols.remove(on)
    else:
        left_at_on = [left[col] for col in on]
        right_at_on = [right[col] for col in on]
        for col in on:
            left_cols.remove(col)
            right_cols.remove(col)

    # Rows of `right` with no key match in `left`, with overlapping non-key
    # columns renamed using the right suffix.
    not_in_left = right[in1d(right_at_on, left_at_on, invert=True)]
    for col in not_in_left.columns:
        if col in left_cols:
            not_in_left[col + right_suffix] = not_in_left[col]
            not_in_left = not_in_left.drop(col, axis=1)

    # Rows of `left` with no key match in `right`, with overlapping non-key
    # columns renamed using the left suffix.
    not_in_right = left[in1d(left_at_on, right_at_on, invert=True)]
    for col in not_in_right.columns:
        if col in right_cols:
            not_in_right[col + left_suffix] = not_in_right[col]
            not_in_right = not_in_right.drop(col, axis=1)

    # Columns of the inner-join result that each unmatched frame is missing.
    left_nan_cols = list(set(inner) - set(not_in_left))
    right_nan_cols = list(set(inner) - set(not_in_right))

    # Align dtypes across the three frames before they are appended.
    for col in set(left_nan_cols).union(set(right_nan_cols)):
        if convert_ints is True and inner[col].dtype == int:
            inner[col] = akcast(inner[col], akfloat64)
        if col in left_nan_cols:
            if convert_ints is True and not_in_right[col].dtype == int:
                not_in_right[col] = akcast(not_in_right[col], akfloat64)
            elif col in not_in_left.columns.values:
                not_in_right[col] = akcast(not_in_right[col], not_in_left[col].dtype)
        if col in right_nan_cols:
            if convert_ints is True and not_in_left[col].dtype == int:
                not_in_left[col] = akcast(not_in_left[col], akfloat64)
            elif col in not_in_right.columns.values:
                not_in_left[col] = akcast(not_in_left[col], not_in_right[col].dtype)

    for col in left_nan_cols:
        # Create a nan array for all values not in the left df
        not_in_left[col] = __nulls_like(inner[col], len(not_in_left))
    for col in right_nan_cols:
        # Create a nan array for all values not in the right df
        not_in_right[col] = __nulls_like(inner[col], len(not_in_right))
    ret_df = DataFrame.append(DataFrame.append(inner, not_in_left), not_in_right)
    if sort is True:
        ret_df = ret_df.sort_values(on).reset_index()
    return ret_df


def __nulls_like(
    arry: Union[pdarray, Strings, Categorical],
    size: Optional[
        Union[
            int,
            np.signedinteger[_8Bit],
            np.signedinteger[_16Bit],
            np.signedinteger[_32Bit],
            np.signedinteger[_64Bit],
            np.unsignedinteger[_8Bit],
            np.unsignedinteger[_16Bit],
            np.unsignedinteger[_32Bit],
            np.unsignedinteger[_64Bit],
        ]
    ] = None,
):
    """
    Build a column of placeholder "missing" values modelled on `arry`.

    Parameters
    ----------
    arry : pdarray, Strings or Categorical
        Column whose kind (and, for other than Strings/Categorical, dtype)
        the result follows.
    size : int, optional
        Length of the result. Defaults to ``arry.size``.

    Returns
    -------
    The result of ``full(size, "nan")`` when `arry` is a Strings or
    Categorical, otherwise ``full(size, np.nan, arry.dtype)``.
    """
    if size is None:
        size = arry.size

    # Strings and Categorical columns use the literal text "nan" as the
    # placeholder; everything else uses np.nan with the source dtype.
    if isinstance(arry, (Strings, Categorical)):
        return full(size, "nan")
    else:
        return full(size, np.nan, arry.dtype)
@typechecked
def merge(
    left: DataFrame,
    right: DataFrame,
    on: Optional[Union[str, List[str]]] = None,
    how: str = "inner",
    left_suffix: str = "_x",
    right_suffix: str = "_y",
    convert_ints: bool = True,
    sort: bool = True,
) -> DataFrame:
    r"""
    Merge Arkouda DataFrames with a database-style join.
    The resulting dataframe contains rows from both DataFrames as specified by
    the merge condition (based on the "how" and "on" parameters).

    Based on pandas merge functionality.
    https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

    Parameters
    ----------
    left: DataFrame
        The Left DataFrame to be joined.
    right: DataFrame
        The Right DataFrame to be joined.
    on: Optional[Union[str, List[str]]] = None
        The name or list of names of the DataFrame column(s) to join on.
        If on is None, this defaults to the columns present in both
        DataFrames, taken in the iteration order of ``left.columns``.
    how: str, default = "inner"
        The merge condition.
        Must be one of "inner", "left", "right", or "outer".
    left_suffix: str, default = "_x"
        A string indicating the suffix to add to columns from the left dataframe for overlapping
        column names in both left and right. Defaults to "_x".
    right_suffix: str, default = "_y"
        A string indicating the suffix to add to columns from the right dataframe for overlapping
        column names in both left and right. Defaults to "_y".
    convert_ints: bool = True
        If True, convert columns with missing int values (due to the join) to float64.
        This is to match pandas.
        If False, do not convert the column dtypes.
        This has no effect when how = "inner".
    sort: bool = True
        If True, DataFrame is returned sorted by "on".
        Otherwise, the DataFrame is not sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame
        Joined Arkouda DataFrame.

    Raises
    ------
    ValueError
        If `on` resolves to a list and any of those columns is not a pdarray
        or Strings in both DataFrames, or if `how` is not one of the
        supported values.

    Note
    ----
    Multiple column joins are only supported for integer columns.

    Examples
    --------
    >>> import arkouda as ak
    >>> ak.connect()
    >>> from arkouda import merge
    >>> left_df = ak.DataFrame({'col1': ak.arange(5), 'col2': -1 * ak.arange(5)})
    >>> display(left_df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      0 |      0 |
    +----+--------+--------+
    |  1 |      1 |     -1 |
    +----+--------+--------+
    |  2 |      2 |     -2 |
    +----+--------+--------+
    |  3 |      3 |     -3 |
    +----+--------+--------+
    |  4 |      4 |     -4 |
    +----+--------+--------+

    >>> right_df = ak.DataFrame({'col1': 2 * ak.arange(5), 'col2': 2 * ak.arange(5)})
    >>> display(right_df)

    +----+--------+--------+
    |    |   col1 |   col2 |
    +====+========+========+
    |  0 |      0 |      0 |
    +----+--------+--------+
    |  1 |      2 |      2 |
    +----+--------+--------+
    |  2 |      4 |      4 |
    +----+--------+--------+
    |  3 |      6 |      6 |
    +----+--------+--------+
    |  4 |      8 |      8 |
    +----+--------+--------+

    >>> merge(left_df, right_df, on = "col1")

    +----+--------+----------+----------+
    |    |   col1 |   col2_x |   col2_y |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      2 |       -2 |        2 |
    +----+--------+----------+----------+
    |  2 |      4 |       -4 |        4 |
    +----+--------+----------+----------+

    >>> merge(left_df, right_df, on = "col1", how = "left")

    +----+--------+----------+----------+
    |    |   col1 |   col2_y |   col2_x |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      1 |      nan |       -1 |
    +----+--------+----------+----------+
    |  2 |      2 |        2 |       -2 |
    +----+--------+----------+----------+
    |  3 |      3 |      nan |       -3 |
    +----+--------+----------+----------+
    |  4 |      4 |        4 |       -4 |
    +----+--------+----------+----------+

    >>> merge(left_df, right_df, on = "col1", how = "right")

    +----+--------+----------+----------+
    |    |   col1 |   col2_x |   col2_y |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      2 |       -2 |        2 |
    +----+--------+----------+----------+
    |  2 |      4 |       -4 |        4 |
    +----+--------+----------+----------+
    |  3 |      6 |      nan |        6 |
    +----+--------+----------+----------+
    |  4 |      8 |      nan |        8 |
    +----+--------+----------+----------+

    >>> merge(left_df, right_df, on = "col1", how = "outer")

    +----+--------+----------+----------+
    |    |   col1 |   col2_y |   col2_x |
    +====+========+==========+==========+
    |  0 |      0 |        0 |        0 |
    +----+--------+----------+----------+
    |  1 |      1 |      nan |       -1 |
    +----+--------+----------+----------+
    |  2 |      2 |        2 |       -2 |
    +----+--------+----------+----------+
    |  3 |      3 |      nan |       -3 |
    +----+--------+----------+----------+
    |  4 |      4 |        4 |       -4 |
    +----+--------+----------+----------+
    |  5 |      6 |        6 |      nan |
    +----+--------+----------+----------+
    |  6 |      8 |        8 |      nan |
    +----+--------+----------+----------+
    """
    # Shared column names, in the iteration order of left.columns. Building
    # this via list(set & set) would make the order depend on string hash
    # randomisation; when `on` is None that order becomes the join-key and
    # sort-key order, so results could differ from one process to the next.
    right_names = set(right.columns)
    col_intersect = [col for col in left.columns if col in right_names]
    on = on if on is not None else col_intersect

    if not isinstance(on, str):
        if not all(
            isinstance(left[col], (pdarray, Strings)) and isinstance(right[col], (pdarray, Strings))
            for col in on
        ):
            raise ValueError("All columns of a multi-column merge must be pdarrays")

    if how == "inner":
        return _inner_join_merge(left, right, on, col_intersect, left_suffix, right_suffix, sort=sort)
    elif how == "right":
        return _right_join_merge(
            left,
            right,
            on,
            col_intersect,
            left_suffix,
            right_suffix,
            convert_ints=convert_ints,
            sort=sort,
        )
    elif how == "left":
        # A left join is a right join with the operands, and their suffixes,
        # swapped.
        return _right_join_merge(
            right,
            left,
            on,
            col_intersect,
            right_suffix,
            left_suffix,
            convert_ints=convert_ints,
            sort=sort,
        )
    elif how == "outer":
        warn(
            "Outer joins should not be performed on large data sets as they may require "
            "prohibitive amounts of memory.",
            UserWarning,
        )
        # Operands and suffixes are passed in swapped order here as well,
        # matching the left-join call above.
        return _outer_join_merge(
            right,
            left,
            on,
            col_intersect,
            right_suffix,
            left_suffix,
            convert_ints=convert_ints,
            sort=sort,
        )
    else:
        raise ValueError(
            f"Unexpected value of {how} for how. Must choose: 'inner', 'left', 'right' or 'outer'"
        )