Source code for arkouda.join

from typing import Callable, Optional, Sequence, Tuple, Union, cast

import numpy as np  # type: ignore
from typeguard import typechecked

from arkouda.alignment import right_align
from arkouda.categorical import Categorical
from arkouda.client import generic_msg
from arkouda.dtypes import NUMBER_FORMAT_STRINGS
from arkouda.dtypes import int64 as akint64
from arkouda.dtypes import resolve_scalar_dtype
from arkouda.groupbyclass import GroupBy, broadcast
from arkouda.numeric import cumsum
from arkouda.pdarrayclass import create_pdarray, pdarray
from arkouda.pdarraycreation import arange, array, ones, zeros
from arkouda.pdarraysetops import concatenate, in1d
from arkouda.strings import Strings

# Names exported by `from arkouda.join import *`.
__all__ = ["join_on_eq_with_dt", "gen_ranges", "compute_join_size"]

# Maps each time-window predicate name accepted by join_on_eq_with_dt to the
# integer code that is formatted into the "pred" field of the server request.
predicates = {"true_dt": 0, "abs_dt": 1, "pos_dt": 2}



[docs]
@typechecked
def join_on_eq_with_dt(
    a1: pdarray,
    a2: pdarray,
    t1: pdarray,
    t2: pdarray,
    dt: Union[int, np.int64],
    pred: str,
    result_limit: Union[int, np.int64] = 1000,
) -> Tuple[pdarray, pdarray]:
    """
    Inner-join two integer arrays on equality, keeping only the pairs for
    which the time-window predicate also holds.

    Parameters
    ----------
    a1 : pdarray, int64
        First array of join values
    a2 : pdarray, int64
        Second array of join values
    t1 : pdarray, int64
        timestamps in millis corresponding to the a1 pdarray
    t2 : pdarray, int64
        timestamps in millis corresponding to the a2 pdarray
    dt : Union[int,np.int64]
        time delta
    pred : str
        time window predicate; one of 'true_dt', 'abs_dt', or 'pos_dt'
    result_limit : Union[int,np.int64]
        size limit for returned result

    Returns
    -------
    result_array_one : pdarray, int64
        a1 indices where a1 == a2
    result_array_two : pdarray, int64
        a2 indices where a2 == a1

    Raises
    ------
    TypeError
        Raised if a1, a2, t1, or t2 is not a pdarray, or if dt or
        result_limit is not an int
    ValueError
        if a1, a2, t1, or t2 dtype is not int64, pred is not
        'true_dt', 'abs_dt', or 'pos_dt', or result_limit is < 0
    """

    def _format_scalar(value):
        # Render a scalar with the format string registered for its dtype.
        return NUMBER_FORMAT_STRINGS[resolve_scalar_dtype(value)].format(value)

    # All four input arrays must be int64; report the first offender by name.
    for label, arr in (("a1", a1), ("a2", a2), ("t1", t1), ("t2", t2)):
        if arr.dtype == akint64:
            continue
        raise ValueError(f"{label} must be int64 dtype")

    if pred not in predicates:
        raise ValueError(f"pred must be one of {predicates.keys()}")

    if result_limit < 0:
        raise ValueError("the result_limit must 0 or greater")

    # Scalars travel to the server as formatted strings.
    dt_text = _format_scalar(dt)
    pred_text = _format_scalar(predicates[pred])
    limit_text = _format_scalar(result_limit)

    # The server operation consumes a2 in grouped form.
    grouped_a2 = GroupBy(a2)
    request_args = {
        "a1": a1,
        "g2seg": cast(pdarray, grouped_a2.segments),  # type: ignore
        "g2keys": cast(pdarray, grouped_a2.unique_keys),  # type: ignore
        "g2perm": grouped_a2.permutation,
        "t1": t1,
        "t2": t2,
        "dt": dt_text,
        "pred": pred_text,
        "resLimit": limit_text,
    }
    reply = generic_msg(cmd="joinEqWithDT", args=request_args)

    # The reply carries the two result-array descriptors separated by "+".
    first_attrs, second_attrs = cast(str, reply).split("+")
    return create_pdarray(first_attrs), create_pdarray(second_attrs)




[docs]
def gen_ranges(starts, ends, stride=1, return_lengths=False):
    """
    Generate a segmented array of variable-length, contiguous ranges between pairs of
    start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride: int
        Difference between successive elements of each range. Must be
        non-zero. Each range holds ``(end - start) // stride`` elements, so a
        trailing partial step is not included.
    return_lengths: bool, optional
        Whether or not to return the lengths of each segment. Default False.

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array
    lengths : pdarray, int64
        The lengths of each segment. Only returned if return_lengths=True.

    Raises
    ------
    ValueError
        If starts and ends differ in size, if stride is zero, or if any
        range would have a negative length.
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if starts.size == 0:
        # Honor return_lengths for empty input so callers can always unpack
        # the number of values they asked for.
        if return_lengths:
            return zeros(0, dtype=akint64), zeros(0, dtype=akint64), zeros(0, dtype=akint64)
        return zeros(0, dtype=akint64), zeros(0, dtype=akint64)
    if stride == 0:
        raise ValueError("stride must be non-zero")
    lengths = (ends - starts) // stride
    if not (lengths >= 0).all():
        raise ValueError("all ends must be greater than or equal to starts")
    non_empty = lengths != 0
    # Exclusive prefix sum of the lengths: offset of each range in the output.
    segs = cumsum(lengths) - lengths
    totlen = lengths.sum()
    if totlen == 0:
        # Every range is empty; there is no first non-empty start to seed the
        # running sum with, so return the empty result directly.
        if return_lengths:
            return segs, zeros(0, dtype=akint64), lengths
        return segs, zeros(0, dtype=akint64)
    # The output is built as a cumulative sum of per-position increments.
    # Inside a range every increment is `stride`...
    slices = ones(totlen, dtype=akint64) * stride
    non_empty_starts = starts[non_empty]
    non_empty_lengths = lengths[non_empty]
    # ...and at the first position of each non-empty range the increment is
    # the jump from the last element of the previous non-empty range
    # (prev_start + (prev_len - 1) * stride) to this range's start. The very
    # first increment is the first start itself.
    diffs = concatenate(
        (
            array([non_empty_starts[0]]),
            non_empty_starts[1:] - non_empty_starts[:-1] - (non_empty_lengths[:-1] - 1) * stride,
        )
    )
    slices[segs[non_empty]] = diffs

    sums = cumsum(slices)
    if return_lengths:
        return segs, sums, lengths
    return segs, sums




[docs]
@typechecked
def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]:
    """
    Compute the internal size of a hypothetical join between a and b. Returns
    both the number of elements and number of bytes required for the join.

    Parameters
    ----------
    a : pdarray
        First array of join values
    b : pdarray
        Second array of join values

    Returns
    -------
    nelem : int
        Number of (a, b) pairs the join would produce: for every value
        present in both arrays, its count in a times its count in b.
    nbytes : int
        ``3 * 8 * nelem``.
        NOTE(review): presumably three 8-byte arrays of length nelem are
        needed internally -- confirm against the join implementation.
    """
    bya = GroupBy(a)
    ua, asize = bya.count()
    byb = GroupBy(b)
    ub, bsize = byb.count()
    # Keep only the counts of values that occur in both arrays.
    afact = asize[in1d(ua, ub)]
    bfact = bsize[in1d(ub, ua)]
    # Convert the reduction result to a builtin int so the returned values
    # match the declared Tuple[int, int] return type.
    nelem = int((afact * bfact).sum())
    nbytes = 3 * 8 * nelem
    return nelem, nbytes



@typechecked
def inner_join(
    left: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
    right: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
    wherefunc: Optional[Callable] = None,
    whereargs: Optional[
        Tuple[
            Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
            Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
        ]
    ] = None,
) -> Tuple[pdarray, pdarray]:
    """Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64), Strings, Categorical, or Sequence of pdarray
        The left values to join
    right : pdarray(int64), Strings, Categorical, or Sequence of pdarray
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray, Strings, Categorical, or Sequence of pdarray, optional
        The two arguments for wherefunc. When left and right are both
        sequences, each element of whereargs is treated as a sequence of
        arrays as well.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If multi-array arguments differ in number or length of arrays, if
        wherefunc does not accept exactly two arguments, if whereargs is
        missing or its sizes do not match left/right, or if a trial
        evaluation of wherefunc on a small sample fails.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    """
    from inspect import signature

    # Multi-array mode is only used when both sides are sequences.
    is_sequence = isinstance(left, Sequence) and isinstance(right, Sequence)

    # Reduce processing to codes to prevent groupby on entire Categorical
    if isinstance(left, Categorical) and isinstance(right, Categorical):
        left_cat, right_cat = Categorical.standardize_categories([left, right])
        left, right = left_cat.codes, right_cat.codes

    if is_sequence:
        if len(left) != len(right):
            raise ValueError("Left must have same num arrays as right")
        left_size, right_size = left[0].size, right[0].size
        if not all(lf.size == left_size for lf in left) or not all(
            rt.size == right_size for rt in right
        ):
            raise ValueError("Multi-array arguments must have equal-length arrays")
    else:
        left_size, right_size = left.size, right.size  # type: ignore

    # Number of leading elements used for the trial call of wherefunc below.
    sample = np.min((left_size, right_size, 5))  # type: ignore
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if is_sequence:
            if len(whereargs[0]) != len(whereargs[1]):
                raise ValueError("Left must have same num arrays as right")
            first_wa_size, second_wa_size = whereargs[0][0].size, whereargs[1][0].size
            if not all(wa.size == first_wa_size for wa in whereargs[0]) or not all(
                wa.size == second_wa_size for wa in whereargs[1]
            ):
                raise ValueError("Multi-array arguments must have equal-length arrays")
        else:
            first_wa_size, second_wa_size = whereargs[0].size, whereargs[1].size  # type: ignore
        if first_wa_size != left_size:
            raise ValueError("Left whereargs must be same size as left join values")
        if second_wa_size != right_size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Trial-run wherefunc on a few leading elements so that a broken
        # function is reported before the join itself is computed.
        try:
            if is_sequence:
                # Slice every array; slicing the sequence itself would drop
                # whole arrays instead of shortening them.
                sample_left = [wa[:sample] for wa in whereargs[0]]
                sample_right = [wa[:sample] for wa in whereargs[1]]
            else:
                sample_left = whereargs[0][:sample]
                sample_right = whereargs[1][:sample]
            _ = wherefunc(sample_left, sample_right)
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e

    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    if keep.sum() == 0:
        # Intersection is empty
        return zeros(0, dtype=akint64), zeros(0, dtype=akint64)
    # Turn the boolean mask into the positions of the kept left items.
    keep = arange(keep.size)[keep]
    # GroupBy right
    byRight = GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item;
    # the appended total size closes the last segment.
    rightSegs = concatenate((byRight.segments, array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft + 1]
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # whereargs was validated as non-None above whenever wherefunc is
        # given; the check is repeated so the narrowing is visible here.
        if whereargs is not None:
            if not is_sequence:
                # Gather right whereargs
                rightWhere = whereargs[1][byRight.permutation][ranges]
                # Expand left whereargs
                keep_where = whereargs[0][keep]
                keep_where = keep_where.codes if isinstance(keep_where, Categorical) else keep_where
                leftWhere = broadcast(fullSegs, keep_where, ranges.size)
            else:
                # Gather right whereargs
                rightWhere = [wa[byRight.permutation][ranges] for wa in whereargs[1]]
                # Expand left whereargs
                keep_where = [wa[keep] for wa in whereargs[0]]
                leftWhere = [broadcast(fullSegs, kw, ranges.size) for kw in keep_where]
            # Evaluate wherefunc and filter ranges, recompute segments
            whereSatisfied = wherefunc(leftWhere, rightWhere)
            filtRanges = ranges[whereSatisfied]
            # Exclusive prefix count of satisfied pairs: the position each
            # pair would take in the filtered output.
            scan = cumsum(whereSatisfied) - whereSatisfied
            filtSegsWithZeros = scan[fullSegs]
            # Size of every segment after filtering; the last one runs to the
            # total number of satisfied pairs.
            filtSegSizes = concatenate(
                (
                    filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
                    array([whereSatisfied.sum() - filtSegsWithZeros[-1]]),
                )
            )
            # Drop left items whose matches were all filtered out.
            keep2 = filtSegSizes > 0
            filtSegs = filtSegsWithZeros[keep2]
            keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = broadcast(filtSegs, arange(left_size)[keep12], filtRanges.size)
    return leftInds, rightInds