Source code for arkouda.plotting

import math
import numpy as np
from matplotlib import pyplot as plt
from arkouda.dataframe import DataFrame
from arkouda.timeclass import Datetime, Timedelta, date_range, timedelta_range
from arkouda.pdarrayclass import skew
from arkouda.pdarraycreation import arange
from arkouda.numeric import histogram, isnan
from arkouda.groupbyclass import GroupBy


[docs] def plot_dist(b, h, log=True, xlabel=None, newfig=True): """ Plot the distribution and cumulative distribution of histogram Data Parameters ---------- b : np.ndarray Bin edges h : np.ndarray Histogram data log : bool use log to scale y xlabel: str Label for the x axis of the graph newfig: bool Generate a new figure or not Notes ----- This function does not return or display the plot. A user must have matplotlib imported in addition to arkouda to display plots. This could be updated to return the object or have a flag to show the resulting plots. See Examples Below. Examples -------- >>> import arkouda as ak >>> from matplotlib import pyplot as plt >>> b, h = ak.histogram(ak.arange(10), 3) >>> ak.plot_dist(b, h.to_ndarray()) >>> # to show the plot >>> plt.show() """ if newfig: plt.figure(figsize=(12, 5)) plt.subplot(1, 2, 1) plt.plot(b, h, marker=".", linestyle="solid") if log: plt.yscale("log") if xlabel is not None: plt.gca().set_xlabel(xlabel, fontsize=14) plt.gca().set_title("distribution") plt.subplot(1, 2, 2) plt.plot(b, np.cumsum(h) / np.sum(h), marker=None, linestyle="solid") plt.gca().set_ylim((0, 1)) plt.gca().set_title("cumulative distribution") if xlabel is not None: plt.gca().set_xlabel(xlabel, fontsize=14)
[docs] def hist_all(ak_df: DataFrame, cols: list = []): """ Create a grid plot histogramming all numeric columns in ak dataframe Parameters ---------- ak_df : ak.DataFrame Full Arkouda DataFrame containing data to be visualized cols : list (Optional) A specified list of columns to be plotted Notes ----- This function displays the plot. Examples -------- >>> import arkouda as ak >>> from arkouda.plotting import hist_all >>> ak_df = ak.DataFrame({"a": ak.array(np.random.randn(100)), "b": ak.array(np.random.randn(100)), "c": ak.array(np.random.randn(100)), "d": ak.array(np.random.randn(100)) }) >>> hist_all(ak_df) """ if len(cols) == 0: cols = ak_df.columns num_rows = int(math.ceil(len(cols) ** 0.5)) num_cols = (len(cols) + num_rows - 1) // num_rows fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 10)) fig.tight_layout(pad=2.0) if isinstance(axes, plt.Axes): axes = np.array(axes).flatten() elif isinstance(axes, np.ndarray): axes = axes.flatten() else: axes = [axes] for col in cols: try: ax = axes[cols.index(col)] x = ak_df[col] if x.dtype == "float64": x = x[~isnan(x)] n = len(x) g1 = skew(x) except ValueError: GB_df = GroupBy(ak_df[col]) new_labels = arange(GB_df.unique_keys.size) newcol = GB_df.broadcast(new_labels) x = newcol[: ak_df.size] if x.dtype == "float64": x = x[~isnan(x)] n = len(x) g1 = skew(x) sigma_g1 = math.sqrt(6 * (n - 2) / ((n + 1) * (n + 3))) # Doane's Formula num_bins = int(1 + math.log2(n) + math.log2(1 + abs(g1) / sigma_g1)) # Compute histogram counts in arkouda h = histogram(x, num_bins) # Compute bins in numpy if isinstance(x, Datetime): # Matplotlib has trouble plotting np.datetime64 and np.timedelta64 bins = date_range(x.min(), x.max(), periods=num_bins).to_ndarray().astype("int") elif isinstance(x, Timedelta): bins = timedelta_range(x.min(), x.max(), periods=num_bins).to_ndarray().astype("int") else: bins = np.linspace(x.min(), x.max(), num_bins + 1)[:-1] ax.bar(bins, h[1].to_ndarray(), width=bins[1] - bins[0]) ax.set_title(col, size=8) if x.max() > 100 * x.min(): ax.set_yscale("log")