from __future__ import annotations
import codecs
import itertools
import re
from typing import (
TYPE_CHECKING,
Dict,
List,
Literal,
Optional,
Tuple,
Union,
cast,
)
from typing import cast as type_cast
import numpy as np
from numpy import dtype as npdtype
from typeguard import typechecked
import arkouda.numpy.dtypes
from arkouda.infoclass import information, list_symbol_table
from arkouda.logger import ArkoudaLogger, get_arkouda_logger
from arkouda.numpy.dtypes import (
NUMBER_FORMAT_STRINGS,
bool_scalars,
int_scalars,
numeric_scalars,
resolve_scalar_dtype,
str_scalars,
)
from arkouda.numpy.dtypes import int64 as akint64
from arkouda.numpy.pdarrayclass import RegistrationError, create_pdarray, parse_single_value, pdarray
from arkouda.numpy.pdarrayclass import all as akall
from arkouda.pandas.match import Match, MatchType
# At runtime a minimal stand-in enum is defined locally; static type checkers
# instead see the SortingAlgorithm declared in arkouda.numpy.sorting.
if not TYPE_CHECKING:
    from enum import Enum

    class SortingAlgorithm(Enum):
        """Runtime placeholder exposing the single member referenced by this module."""

        RadixSortLSD = "RadixSortLSD"

else:
    from arkouda.numpy.sorting import SortingAlgorithm
# Names exported by a star-import of this module.
__all__ = ["Strings"]
# Command strings for message passing to arkouda server, specific to Strings
# CMD_ASSEMBLE is sent by Strings.from_parts to merge an offsets array and a
# values array into one Strings object.
CMD_ASSEMBLE = "segStr-assemble"
# NOTE(review): CMD_TO_NDARRAY is not referenced in the portion of the file
# visible here; presumably used by the to_ndarray transfer path -- confirm.
CMD_TO_NDARRAY = "segStr-tondarray"
[docs]
class Strings:
"""
Represents an array of strings whose data resides on the
arkouda server. The user should not call this class directly;
rather its instances are created by other arkouda functions.
Attributes
----------
entry : pdarray
Encapsulation of a Segmented Strings array contained on
the arkouda server. This is a composite of
- offsets array: starting indices for each string
- bytes array: raw bytes of all strings joined by nulls
size : int_scalars
The number of strings in the array
nbytes : int_scalars
The total number of bytes in all strings
ndim : int_scalars
The rank of the array (currently only rank 1 arrays supported)
shape : tuple
The sizes of each dimension of the array
dtype : type
The dtype is ak.str_
logger : ArkoudaLogger
Used for all logging operations
Notes
-----
Strings is composed of two pdarrays: (1) offsets, which contains the
starting indices for each string and (2) bytes, which contains the
raw bytes of all strings, delimited by nulls.
"""
entry: pdarray
size: int_scalars
nbytes: int_scalars
ndim: int_scalars
shape: Tuple[int]
logger: ArkoudaLogger
BinOps = frozenset(["==", "!="])
objType = "Strings"
[docs]
@staticmethod
def from_return_msg(rep_msg: str) -> Strings:
    """
    Create a Strings object from an Arkouda server response message.

    The message is split on ``+`` into two parts: the left part is handed to
    ``create_pdarray`` and the last whitespace-separated token of the right
    part is parsed as the total byte size.

    Parameters
    ----------
    rep_msg : str
        Server response message of the form::

            created <name> <type> <size> <ndim> <shape> <itemsize>+... bytes.size <total_bytes>

        For example::

            "created foo Strings 3 1 (3,) 8+created bytes.size 24"

    Returns
    -------
    Strings
        A `Strings` object wrapping the pdarray built from the left part,
        with ``nbytes`` set from the right part.

    Raises
    ------
    RuntimeError
        If the message does not contain exactly one ``+`` separator, or if the
        byte size cannot be parsed from the right-hand part.

    Examples
    --------
    >>> import arkouda as ak

    # Example response message (typically from `generic_msg`)
    >>> rep_msg = "created foo Strings 3 1 (3,) 8+created bytes.size 24"
    >>> s = ak.Strings.from_return_msg(rep_msg)
    >>> isinstance(s, ak.Strings)
    True
    """
    # The unpack itself can fail (no "+" or more than one); report that as the
    # documented RuntimeError instead of leaking a bare ValueError.
    try:
        left, right = cast(str, rep_msg).split("+")
    except (AttributeError, TypeError, ValueError) as e:
        raise RuntimeError(
            f"Cannot parse Strings descriptor from response: {rep_msg}"
        ) from e
    try:
        bytes_size = int(right.split()[-1])
    except Exception as e:
        raise RuntimeError(f"Cannot parse byte size from response: {rep_msg}") from e
    return Strings(create_pdarray(left), bytes_size)
[docs]
@staticmethod
def from_parts(offset_attrib: Union[pdarray, str], bytes_attrib: Union[pdarray, str]) -> Strings:
    """
    Build a Strings object out of an offsets component and a bytes component.

    Each argument that is not already a `pdarray` is first passed through
    `create_pdarray`. The two arrays are then sent to the server with the
    `CMD_ASSEMBLE` command and the reply is parsed by
    `Strings.from_return_msg`.

    Parameters
    ----------
    offset_attrib : pdarray or str
        The offsets array, or a value accepted by `create_pdarray`.
    bytes_attrib : pdarray or str
        The bytes (values) array, or a value accepted by `create_pdarray`.

    Returns
    -------
    Strings
        The Strings object described by the server's reply.

    Raises
    ------
    RuntimeError
        If `create_pdarray` fails for either argument. Errors raised while
        talking to the server or parsing its reply propagate unchanged.
    """
    from arkouda.client import generic_msg

    offsets = offset_attrib
    if not isinstance(offsets, pdarray):
        try:
            offsets = create_pdarray(offsets)
        except Exception as err:
            raise RuntimeError(f"Failed to convert offsets: {err}") from err

    values = bytes_attrib
    if not isinstance(values, pdarray):
        try:
            values = create_pdarray(values)
        except Exception as err:
            raise RuntimeError(f"Failed to convert values: {err}") from err

    assemble_args = {"offsets": offsets, "values": values}
    reply = generic_msg(cmd=CMD_ASSEMBLE, args=assemble_args)
    return Strings.from_return_msg(cast(str, reply))
def __init__(self, strings_pdarray: pdarray, bytes_size: int_scalars) -> None:
"""
Initialize the Strings instance by setting all instance
attributes, some of which are derived from the array parameters.
Parameters
----------
strings_pdarray : pdarray
the array containing the meta-info on a server side strings object
bytes_size : int_scalars
length of the bytes array contained on the server aka total bytes
Raises
------
RuntimeError
Raised if there's an error converting a server-returned str-descriptor
or pdarray to either the offset_attrib or bytes_attrib
ValueError
Raised if there's an error in generating instance attributes
from either the offset_attrib or bytes_attrib parameter
"""
self.entry: pdarray = strings_pdarray
self.registered_name: Optional[str] = None
try:
self.size = self.entry.size
self.nbytes = bytes_size # This is a deficiency of server GenSymEntry right now
self.ndim = self.entry.ndim
self.shape = self.entry.shape
self.name: Optional[str] = self.entry.name
except Exception as e:
raise ValueError(e)
self._bytes: Optional[pdarray] = None
self._offsets: Optional[pdarray] = None
self._regex_dict: Dict = dict()
self.logger = get_arkouda_logger(name=__class__.__name__) # type: ignore
"""
NOTE:
The Strings.__del__() method should NOT be implemented.
Python will invoke the __del__() of any components by default.
Overriding this default behavior with an explicitly specified Strings.__del__() method may
introduce unknown symbol errors.
By letting Python's garbage collection handle this automatically, we avoid extra maintenance.
"""
def __iter__(self):
raise NotImplementedError(
"Strings does not support iteration. To force data transfer from server, use to_ndarray."
)
def __len__(self) -> int:
return self.shape[0]
def __str__(self) -> str:
    """
    Render the array as a bracketed, comma-separated list of quoted strings.

    Every element is fetched individually through ``self[i]``. When the size
    exceeds ``pdarrayIterThresh`` only the first three and last three
    elements are shown, separated by an ellipsis marker.
    """
    from arkouda.client import pdarrayIterThresh

    def quoted(position):
        return f"'{self[position]}'"

    count = self.size
    if count > pdarrayIterThresh:
        pieces = [quoted(i) for i in range(3)]
        pieces.append("... ")
        pieces += [quoted(i) for i in range(count - 3, count)]
    else:
        pieces = [quoted(i) for i in range(count)]
    return "[" + ", ".join(pieces) + "]"
def __repr__(self) -> str:
return f"array({self.__str__()})"
@typechecked
def _binop(self, other: Union[Strings, str_scalars], op: str) -> pdarray:
    """
    Run an element-wise comparison of this Strings against another operand.

    Parameters
    ----------
    other : Strings or str_scalars
        Right-hand operand: another Strings of the same size, or a single
        string value.
    op : str
        Operator symbol; must be a member of ``self.BinOps``.

    Returns
    -------
    pdarray
        Built from the server's reply to the comparison command.

    Raises
    ------
    ValueError
        Raised if ``op`` is not in ``self.BinOps``, if ``other`` is a Strings
        of a different size, or if ``other`` is neither a Strings nor a
        string scalar.
    RuntimeError
        Raised if a server-side error is thrown while executing the
        binary operation
    """
    from arkouda.client import generic_msg

    if op not in self.BinOps:
        raise ValueError(f"Strings: unsupported operator: {op}")

    # Validate the operand before assembling the request.
    other_is_strings = isinstance(other, Strings)
    if other_is_strings:
        if self.size != other.size:
            raise ValueError(f"Strings: size mismatch {self.size} {other.size}")
    elif resolve_scalar_dtype(other) != "str":
        raise ValueError(
            f"Strings: {op} not supported between Strings and {other.__class__.__name__}"
        )

    # Fields shared by the vector-vector and vector-scalar requests.
    request = {"op": op, "objType": self.objType, "obj": self.entry}
    if other_is_strings:
        command = "segmentedBinopvv"
        request["otherType"] = other.objType
        request["other"] = other.entry
        request["left"] = False  # placeholder for stick
        request["delim"] = ""  # placeholder for stick
    else:
        command = "segmentedBinopvs"
        request["otherType"] = "str"
        request["other"] = other
    return create_pdarray(generic_msg(cmd=command, args=request))
def __eq__(self, other): # type: ignore
if self.size > 0:
return self._binop(other, "==")
else:
from arkouda import array as ak_array
return ak_array([], dtype="bool")
def __ne__(self, other):
if self.size > 0: # type: ignore
return self._binop(cast(Strings, other), "!=")
else:
from arkouda import array as ak_array
return ak_array([], dtype="bool")
def __getitem__(self, key):
    """
    Index into the Strings.

    Supported keys, checked in this order:

    - an integer scalar (int64/uint64): returns the single value parsed from
      the server reply; negative values count from the end;
    - a ``slice``: returns a new Strings;
    - a ``pdarray`` with dtype bool, int or uint: returns a new Strings
      (a bool mask must have the same size as this Strings);
    - a ``numpy.ndarray``: converted to a pdarray and indexed again.

    Raises
    ------
    IndexError
        If an integer key is out of bounds.
    TypeError
        If the key, or the dtype of a pdarray key, is not supported.
    ValueError
        If a bool pdarray key differs in size from this Strings.
    """
    from arkouda.client import generic_msg

    # Single integer position.
    if np.isscalar(key) and (resolve_scalar_dtype(key) in ["int64", "uint64"]):
        requested = key
        if key < 0:
            # Interpret negative key as offset from end of array
            key += self.size
        if not (key >= 0 and key < self.size):
            raise IndexError(f"[int] {requested} is out of bounds with size {self.size}")
        reply = generic_msg(
            cmd="segmentedIndex",
            args={
                "subcmd": "intIndex",
                "objType": self.objType,
                "dtype": self.entry.dtype,
                "obj": self.entry,
                "key": key,
            },
        )
        # The value follows the first whitespace-delimited token of the reply.
        _, payload = reply.split(maxsplit=1)
        return parse_single_value(payload)

    # Python slice.
    if isinstance(key, slice):
        (start, stop, stride) = key.indices(self.size)
        self.logger.debug(f"start: {start}; stop: {stop}; stride: {stride}")
        reply = generic_msg(
            cmd="segmentedIndex",
            args={
                "subcmd": "sliceIndex",
                "objType": self.objType,
                "obj": self.entry,
                "dtype": self.entry.dtype,
                "key": [start, stop, stride],
            },
        )
        return Strings.from_return_msg(reply)

    # Server-side index or mask array.
    if isinstance(key, pdarray):
        if key.dtype not in ("bool", "int", "uint"):
            raise TypeError(f"unsupported pdarray index type {key.dtype}")
        if key.dtype == "bool" and self.size != key.size:
            raise ValueError(f"size mismatch {self.size} {key.size}")
        reply = generic_msg(
            cmd="segmentedIndex",
            args={
                "subcmd": "pdarrayIndex",
                "objType": self.objType,
                "dtype": self.entry.dtype,
                "obj": self.entry,
                "key": key,
            },
        )
        return Strings.from_return_msg(reply)

    # Local NumPy array: convert to a pdarray, then re-enter the pdarray path.
    if isinstance(key, np.ndarray):
        from arkouda.numpy.pdarraycreation import array as ak_array

        return self[ak_array(key)]

    raise TypeError(f"unsupported pdarray index type {key.__class__.__name__}")
@property
def dtype(self) -> npdtype:
    """Return the NumPy dtype reported for this array: ``numpy.dtype("<U")``."""
    unicode_dtype = npdtype("<U")
    return unicode_dtype
@property
def inferred_type(self) -> str:
    """Return the inferred type label, which is always ``"string"``."""
    label = "string"
    return label
[docs]
def copy(self) -> Strings:
    """
    Return a deep copy of the Strings object.

    The copy is produced by calling ``array(self, copy=True)``.

    Returns
    -------
    Strings
        A deep copy of the Strings.

    Raises
    ------
    RuntimeError
        If ``array`` returns something other than a Strings.
    """
    # Same module path that __getitem__ imports `array` from, so both methods
    # resolve the creation routine identically.
    from arkouda.numpy.pdarraycreation import array

    duplicate = array(self, copy=True)
    if not isinstance(duplicate, Strings):
        raise RuntimeError("Could not copy Strings object.")
    return duplicate
[docs]
def equals(self, other) -> bool_scalars:
"""
Whether Strings are the same size and all entries are equal.
Parameters
----------
other : Any
object to compare.
Returns
-------
bool_scalars
True if the Strings are the same, o.w. False.
Examples
--------
>>> import arkouda as ak
>>> s = ak.array(["a", "b", "c"])
>>> s_cpy = ak.array(["a", "b", "c"])
>>> s.equals(s_cpy)
np.True_
>>> s2 = ak.array(["a", "x", "c"])
>>> s.equals(s2)
np.False_
"""
if isinstance(other, Strings):
if other.size != self.size:
return False
else:
result = akall(self == other)
if isinstance(result, (bool, np.bool_)):
return result
return False
[docs]
def get_lengths(self) -> pdarray:
    """
    Return the length of each string in the array.

    Issues the ``segmentLengths`` command for this object's entry.

    Returns
    -------
    pdarray
        The length of each string

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown
    """
    from arkouda.client import generic_msg

    request = {"objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="segmentLengths", args=request)
    return create_pdarray(reply)
[docs]
def get_bytes(self) -> pdarray:
    """
    Getter for the bytes component (uint8 pdarray) of this Strings.

    The result is cached on the instance. It is fetched again whenever no
    cached array exists or the cached array's name is absent from the names
    returned by ``list_symbol_table()``.

    Returns
    -------
    pdarray
        Pdarray of bytes of the string accessed

    Raises
    ------
    RuntimeError
        If the bytes component is still unset after the fetch.

    Example
    -------
    >>> import arkouda as ak
    >>> x = ak.array(['one', 'two', 'three'])
    >>> x.get_bytes()
    array([111 110 101 0 116 119 111 0 116 104 114 101 101 0])
    """
    from arkouda.client import generic_msg

    cached = self._bytes
    if cached is None or cached.name not in list_symbol_table():
        reply = generic_msg(
            cmd="getSegStringProperty",
            args={"property": "get_bytes", "obj": self.entry},
        )
        self._bytes = create_pdarray(reply)
    if self._bytes is None:
        raise RuntimeError("Failed to initialize the bytes property.")
    return self._bytes
[docs]
def get_offsets(self) -> pdarray:
    """
    Getter for the offsets component (int64 pdarray) of this Strings.

    The result is cached on the instance. It is fetched again whenever no
    cached array exists or the cached array's name is absent from the names
    returned by ``list_symbol_table()``.

    Returns
    -------
    pdarray
        Pdarray of offsets of the string accessed

    Raises
    ------
    RuntimeError
        If the offsets component is still unset after the fetch.

    Example
    -------
    >>> import arkouda as ak
    >>> x = ak.array(['one', 'two', 'three'])
    >>> x.get_offsets()
    array([0 4 8])
    """
    from arkouda.client import generic_msg

    cached = self._offsets
    if cached is None or cached.name not in list_symbol_table():
        reply = generic_msg(
            cmd="getSegStringProperty",
            args={"property": "get_offsets", "obj": self.entry},
        )
        self._offsets = create_pdarray(reply)
    if self._offsets is None:
        raise RuntimeError("Failed to initialize the offsets property.")
    return self._offsets
[docs]
def encode(self, toEncoding: str, fromEncoding: str = "UTF-8") -> Strings:
    """
    Return a new Strings in `toEncoding`, treating the current contents as
    being encoded in `fromEncoding`.

    Conversions that involve IDNA on one side and something other than UTF-8
    on the other are performed in two steps, going through UTF-8.

    Parameters
    ----------
    toEncoding: str
        The encoding that the strings will be converted to
    fromEncoding : str, default="UTF-8"
        The current encoding of the strings object, default to
        UTF-8

    Returns
    -------
    Strings
        A new Strings object in `toEncoding`

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown
    """
    from arkouda.client import generic_msg

    def _convert(obj, target: str, source: str) -> Strings:
        # One "encode" round trip for the given server-side entry.
        reply = generic_msg(
            cmd="encode",
            args={"toEncoding": target, "fromEncoding": source, "obj": obj},
        )
        return Strings.from_return_msg(cast(str, reply))

    target_upper = toEncoding.upper()
    source_upper = fromEncoding.upper()
    needs_utf8_hop = (target_upper == "IDNA" and source_upper != "UTF-8") or (
        target_upper != "UTF-8" and source_upper == "IDNA"
    )
    if needs_utf8_hop:
        # first convert to UTF-8 ...
        intermediate = _convert(self.entry, "UTF-8", fromEncoding)
        # ... then to the requested encoding. Pass the entry, as every other
        # request in this class does, rather than the Strings wrapper itself.
        return _convert(intermediate.entry, toEncoding, "UTF-8")
    return _convert(self.entry, toEncoding, fromEncoding)
[docs]
def decode(self, fromEncoding: str, toEncoding: str = "UTF-8") -> Strings:
    """
    Return a new Strings in `toEncoding`, treating the current contents as
    being encoded in `fromEncoding`.

    This is `encode` with the two encoding arguments given in the opposite
    order.

    Parameters
    ----------
    fromEncoding: str
        The current encoding of the strings object
    toEncoding : str, default="UTF-8"
        The encoding that the strings will be converted to,
        default to UTF-8

    Returns
    -------
    Strings
        A new Strings object in `toEncoding`

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown
    """
    return self.encode(toEncoding=toEncoding, fromEncoding=fromEncoding)
[docs]
@typechecked
def lower(self) -> Strings:
    """
    Return a new Strings in which every uppercase character of the original
    is replaced by its lowercase equivalent.

    Returns
    -------
    Strings
        Strings with all uppercase characters from the original replaced with
        their lowercase equivalent

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.upper

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array([f'StrINgS {i}' for i in range(5)])
    >>> strings
    array(['StrINgS 0', 'StrINgS 1', 'StrINgS 2', 'StrINgS 3', 'StrINgS 4'])
    >>> strings.lower()
    array(['strings 0', 'strings 1', 'strings 2', 'strings 3', 'strings 4'])
    """
    from arkouda.client import generic_msg

    request = {"subcmd": "toLower", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="caseChange", args=request)
    return Strings.from_return_msg(cast(str, reply))
[docs]
@typechecked
def upper(self) -> Strings:
    """
    Return a new Strings in which every lowercase character of the original
    is replaced by its uppercase equivalent.

    Returns
    -------
    Strings
        Strings with all lowercase characters from the original replaced with
        their uppercase equivalent

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.lower

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array([f'StrINgS {i}' for i in range(5)])
    >>> strings
    array(['StrINgS 0', 'StrINgS 1', 'StrINgS 2', 'StrINgS 3', 'StrINgS 4'])
    >>> strings.upper()
    array(['STRINGS 0', 'STRINGS 1', 'STRINGS 2', 'STRINGS 3', 'STRINGS 4'])
    """
    from arkouda.client import generic_msg

    request = {"subcmd": "toUpper", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="caseChange", args=request)
    return Strings.from_return_msg(cast(str, reply))
[docs]
@typechecked
def title(self) -> Strings:
    """
    Return a new Strings holding the titlecase equivalent of each original string.

    Returns
    -------
    Strings
        Strings from the original replaced with their titlecase equivalent.

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown.

    See Also
    --------
    Strings.lower
    Strings.upper

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array([f'StrINgS {i}' for i in range(5)])
    >>> strings
    array(['StrINgS 0', 'StrINgS 1', 'StrINgS 2', 'StrINgS 3', 'StrINgS 4'])
    >>> strings.title()
    array(['Strings 0', 'Strings 1', 'Strings 2', 'Strings 3', 'Strings 4'])
    """
    from arkouda.client import generic_msg

    request = {"subcmd": "toTitle", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="caseChange", args=request)
    return Strings.from_return_msg(cast(str, reply))
[docs]
@typechecked
def isdecimal(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings consists entirely of decimal characters.

    Returns
    -------
    pdarray
        True for elements that are decimals, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.isdigit

    Examples
    --------
    >>> import arkouda as ak
    >>> not_decimal = ak.array([f'Strings {i}' for i in range(3)])
    >>> decimal = ak.array([f'12{i}' for i in range(3)])
    >>> strings = ak.concatenate([not_decimal, decimal])
    >>> strings
    array(['Strings 0', 'Strings 1', 'Strings 2', '120', '121', '122'])
    >>> strings.isdecimal()
    array([False False False True True True])

    Special Character Examples

    >>> special_strings = ak.array(["3.14", "\u0030", "\u00b2", "2³₇", "2³x₇"])
    >>> special_strings
    array(['3.14', '0', '²', '2³₇', '2³x₇'])
    >>> special_strings.isdecimal()
    array([False True False False False])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isDecimal", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isnumeric(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings consists entirely of numeric characters.

    There are 1922 unicode characters that qualify as numeric, including the
    digits 0 through 9, superscripts and subscripted digits, special
    characters with the digits encircled or enclosed in parens, "vulgar
    fractions," and more.

    Returns
    -------
    pdarray
        True for elements that are numerics, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.isdecimal

    Examples
    --------
    >>> import arkouda as ak
    >>> not_numeric = ak.array([f'Strings {i}' for i in range(3)])
    >>> numeric = ak.array([f'12{i}' for i in range(3)])
    >>> strings = ak.concatenate([not_numeric, numeric])
    >>> strings
    array(['Strings 0', 'Strings 1', 'Strings 2', '120', '121', '122'])
    >>> strings.isnumeric()
    array([False False False True True True])

    Special Character Examples

    >>> special_strings = ak.array(["3.14", "\u0030", "\u00b2", "2³₇", "2³x₇"])
    >>> special_strings
    array(['3.14', '0', '²', '2³₇', '2³x₇'])
    >>> special_strings.isnumeric()
    array([False True True True False])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isNumeric", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def capitalize(self) -> Strings:
    """
    Return a new Strings in which each string has its first letter
    capitalized and its remaining letters lowercased.

    Returns
    -------
    Strings
        Strings from the original replaced with the capitalized equivalent.

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown.

    See Also
    --------
    Strings.lower
    Strings.upper
    Strings.title

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array([f'StrINgS aRe Here {i}' for i in range(5)])
    >>> strings
    array(['StrINgS aRe Here 0', 'StrINgS aRe Here 1', 'StrINgS aRe Here 2', \
'StrINgS aRe Here 3', 'StrINgS aRe Here 4'])
    >>> strings.capitalize()
    array(['Strings are here 0', 'Strings are here 1', 'Strings are here 2', \
'Strings are here 3', 'Strings are here 4'])
    """
    from arkouda.client import generic_msg

    request = {"subcmd": "capitalize", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="caseChange", args=request)
    return Strings.from_return_msg(cast(str, reply))
[docs]
@typechecked
def islower(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings is entirely lowercase.

    Returns
    -------
    pdarray
        True for elements that are entirely lowercase, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.isupper

    Examples
    --------
    >>> import arkouda as ak
    >>> lower = ak.array([f'strings {i}' for i in range(3)])
    >>> upper = ak.array([f'STRINGS {i}' for i in range(3)])
    >>> strings = ak.concatenate([lower, upper])
    >>> strings
    array(['strings 0', 'strings 1', 'strings 2', 'STRINGS 0', 'STRINGS 1', 'STRINGS 2'])
    >>> strings.islower()
    array([True True True False False False])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isLower", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isupper(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings is entirely uppercase.

    Returns
    -------
    pdarray
        True for elements that are entirely uppercase, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower

    Examples
    --------
    >>> import arkouda as ak
    >>> lower = ak.array([f'strings {i}' for i in range(3)])
    >>> upper = ak.array([f'STRINGS {i}' for i in range(3)])
    >>> strings = ak.concatenate([lower, upper])
    >>> strings
    array(['strings 0', 'strings 1', 'strings 2', 'STRINGS 0', 'STRINGS 1', 'STRINGS 2'])
    >>> strings.isupper()
    array([False False False True True True])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isUpper", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def istitle(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings is titlecase.

    Returns
    -------
    pdarray
        True for elements that are titlecase, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower
    Strings.isupper

    Examples
    --------
    >>> import arkouda as ak
    >>> mixed = ak.array([f'sTrINgs {i}' for i in range(3)])
    >>> title = ak.array([f'Strings {i}' for i in range(3)])
    >>> strings = ak.concatenate([mixed, title])
    >>> strings
    array(['sTrINgs 0', 'sTrINgs 1', 'sTrINgs 2', 'Strings 0', 'Strings 1', 'Strings 2'])
    >>> strings.istitle()
    array([False False False True True True])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isTitle", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isalnum(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings is alphanumeric.

    Returns
    -------
    pdarray
        True for elements that are alphanumeric, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower
    Strings.isupper
    Strings.istitle

    Examples
    --------
    >>> import arkouda as ak
    >>> not_alnum = ak.array([f'%Strings {i}' for i in range(3)])
    >>> alnum = ak.array([f'Strings{i}' for i in range(3)])
    >>> strings = ak.concatenate([not_alnum, alnum])
    >>> strings
    array(['%Strings 0', '%Strings 1', '%Strings 2', 'Strings0', 'Strings1', 'Strings2'])
    >>> strings.isalnum()
    array([False False False True True True])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isalnum", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isalpha(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings is alphabetic, i.e. it has at least one character and all of its
    characters are alphabetic.

    Returns
    -------
    pdarray
        True for elements that are alphabetic, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower
    Strings.isupper
    Strings.istitle
    Strings.isalnum

    Examples
    --------
    >>> import arkouda as ak
    >>> not_alpha = ak.array([f'%Strings {i}' for i in range(3)])
    >>> alpha = ak.array(['StringA','StringB','StringC'])
    >>> strings = ak.concatenate([not_alpha, alpha])
    >>> strings
    array(['%Strings 0', '%Strings 1', '%Strings 2', 'StringA', 'StringB', 'StringC'])
    >>> strings.isalpha()
    array([False False False True True True])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isalpha", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isdigit(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings consists entirely of digit characters.

    Returns
    -------
    pdarray
        True for elements that are digits, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower
    Strings.isupper
    Strings.istitle

    Examples
    --------
    >>> import arkouda as ak
    >>> not_digit = ak.array([f'Strings {i}' for i in range(3)])
    >>> digit = ak.array([f'12{i}' for i in range(3)])
    >>> strings = ak.concatenate([not_digit, digit])
    >>> strings
    array(['Strings 0', 'Strings 1', 'Strings 2', '120', '121', '122'])
    >>> strings.isdigit()
    array([False False False True True True])

    Special Character Examples

    >>> special_strings = ak.array(["3.14", "\u0030", "\u00b2", "2³₇", "2³x₇"])
    >>> special_strings
    array(['3.14', '0', '²', '2³₇', '2³x₇'])
    >>> special_strings.isdigit()
    array([False True True True False])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isdigit", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isempty(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i of the
    Strings is empty.

    Returns
    -------
    pdarray
        True for elements that are the empty string, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower
    Strings.isupper
    Strings.istitle

    Examples
    --------
    >>> import arkouda as ak
    >>> not_empty = ak.array([f'Strings {i}' for i in range(3)])
    >>> empty = ak.array(['' for i in range(3)])
    >>> strings = ak.concatenate([not_empty, empty])
    >>> strings
    array(['Strings 0', 'Strings 1', 'Strings 2', '', '', ''])
    >>> strings.isempty()
    array([False False False True True True])
    """
    from arkouda.client import generic_msg

    query = {"subcmd": "isempty", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def isspace(self) -> pdarray:
    """
    Return a boolean pdarray whose entry i tells whether string i consists
    entirely of whitespace characters (space, tab, newline, vertical tab,
    form feed, carriage return).

    Returns
    -------
    pdarray
        True for elements that are whitespace, False otherwise

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.islower
    Strings.isupper
    Strings.istitle

    Examples
    --------
    >>> import arkouda as ak
    >>> not_space = ak.array([f'Strings {i}' for i in range(3)])
    >>> space = ak.array([' ', '\\t', '\\n', '\\v', '\\f', '\\r', ' \\t\\n\\v\\f\\r'])
    >>> strings = ak.concatenate([not_space, space])
    >>> strings
    array(['Strings 0', 'Strings 1', 'Strings 2', ' ', 'u0009', 'n', \
'u000B', 'u000C', 'u000D', ' u0009nu000Bu000Cu000D'])
    >>> strings.isspace()
    array([False False False True True True True True True True])
    """  # noqa: D301
    from arkouda.client import generic_msg

    query = {"subcmd": "isspace", "objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="checkChars", args=query)
    return create_pdarray(reply)
[docs]
@typechecked
def strip(self, chars: Optional[Union[bytes, str_scalars]] = "") -> Strings:
    """
    Return a new Strings object with all leading and trailing occurrences of characters contained
    in chars removed. The chars argument is a string specifying the set of characters to be removed.
    If omitted or None, the chars argument defaults to removing whitespace. The chars argument is
    not a prefix or suffix; rather, all combinations of its values are stripped.

    Parameters
    ----------
    chars : bytes or str_scalars, optional
        the set of characters to be removed. ``bytes`` input is decoded to
        ``str`` before being sent; ``None`` is treated like the default ``""``.

    Returns
    -------
    Strings
        Strings object with the leading and trailing characters matching the set of characters in
        the chars argument removed

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['Strings ', ' StringS ', 'StringS '])
    >>> s = strings.strip()
    >>> s
    array(['Strings', 'StringS', 'StringS'])
    >>> strings = ak.array(['Strings 1', '1 StringS ', ' 1StringS 12 '])
    >>> s = strings.strip(' 12')
    >>> s
    array(['Strings', 'StringS', 'StringS'])
    """
    from arkouda.client import generic_msg

    if chars is None:
        # The signature admits None; map it to the default rather than
        # forwarding a non-string value to the server.
        chars = ""
    elif isinstance(chars, bytes):
        chars = chars.decode()
    rep_msg = generic_msg(
        cmd="segmentedStrip",
        args={"objType": self.objType, "name": self.entry, "chars": chars},
    )
    return Strings.from_return_msg(cast(str, rep_msg))
[docs]
@typechecked
def cached_regex_patterns(self) -> List:
    """Return the regex patterns currently present as keys of the matcher cache."""
    return [pattern for pattern in self._regex_dict]
[docs]
@typechecked
def purge_cached_regex_patterns(self) -> None:
    """Drop all cached regex matchers by rebinding the cache to a new empty dict."""
    self._regex_dict = {}
def _empty_pattern_verification(self, pattern):
if pattern == "$" or (re.search(pattern, "") and (self == "").any()): # type: ignore
# TODO remove once changes from chapel issue #20431 and #20441 are in arkouda
raise ValueError(
"regex operations not currently supported with a pattern='$' or pattern='' when "
"the empty string is contained in Strings"
)
def _get_matcher(self, pattern: Union[bytes, str_scalars], create: bool = True):
    """
    Fetch the cached Matcher for ``pattern``, optionally creating it.

    Parameters
    ----------
    pattern : bytes or str_scalars
        Regex pattern; ``bytes`` input is decoded to ``str`` first.
    create : bool, default=True
        Whether to build and cache a new Matcher when none is cached.

    Returns
    -------
    Matcher or None
        The cached (or newly cached) Matcher; None when nothing is cached
        for ``pattern`` and ``create`` is False.

    Raises
    ------
    ValueError
        If ``pattern`` cannot be compiled by ``re.compile`` (the original
        error is attached as ``__cause__``), or if it is rejected by
        ``_empty_pattern_verification``.
    """
    from arkouda.pandas.matcher import Matcher

    if isinstance(pattern, bytes):
        pattern = pattern.decode()
    # Compile locally first so an invalid pattern is reported before any
    # Matcher is created for it.
    try:
        re.compile(pattern)
    except Exception as e:
        raise ValueError(e) from e
    self._empty_pattern_verification(pattern)

    if pattern not in self._regex_dict:
        if not create:
            return None
        self._regex_dict[pattern] = Matcher(pattern=pattern, parent_entry_name=self.entry.name)
    return self._regex_dict[pattern]
[docs]
@typechecked
def find_locations(self, pattern: Union[bytes, str_scalars]) -> Tuple[pdarray, pdarray, pdarray]:
    r"""
    Locate every match of a regex pattern.

    Returns pdarrays holding the number of matches per string, the start
    positions of the matches, and the lengths of the matches.

    Parameters
    ----------
    pattern : bytes or str_scalars
        The regex pattern used to find matches

    Returns
    -------
    Tuple[pdarray, pdarray, pdarray]
        pdarray, int64
            For each original string, the number of pattern matches
        pdarray, int64
            The start positions of pattern matches
        pdarray, int64
            The lengths of pattern matches

    Raises
    ------
    TypeError
        Raised if the pattern parameter is not bytes or str_scalars
    ValueError
        Raised if pattern is not a valid regex
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.findall, Strings.match

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array([f'{i} string {i}' for i in range(1, 6)])
    >>> num_matches, starts, lens = strings.find_locations('\\d')
    >>> num_matches
    array([2 2 2 2 2])
    >>> starts
    array([0 9 0 9 0 9 0 9 0 9])
    >>> lens
    array([1 1 1 1 1 1 1 1 1 1])
    """
    located = self._get_matcher(pattern)
    # Populates num_matches / starts / lengths on the matcher.
    located.find_locations()
    return located.num_matches, located.starts, located.lengths
[docs]
@typechecked
def search(self, pattern: Union[bytes, str_scalars]) -> Match:
    """
    Build a Match object recording, per element, the first location where pattern matches.

    An element counts as matched if any part of the string matches the
    regular expression pattern.

    Parameters
    ----------
    pattern : bytes or str_scalars
        Regex used to find matches

    Returns
    -------
    Match
        Match object where elements match if any part of the string matches the
        regular expression pattern

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.search('_+')
    <ak.Match object: matched=True, span=(1, 2); matched=True, span=(0, 4);
    matched=False; matched=True, span=(0, 2); matched=False>
    """
    matcher = self._get_matcher(pattern)
    return matcher.get_match(MatchType.SEARCH, self)
[docs]
@typechecked
def match(self, pattern: Union[bytes, str_scalars]) -> Match:
    """
    Build a Match object in which an element matches only at the start of the string.

    An element counts as matched only if the beginning of the string matches
    the regular expression pattern.

    Parameters
    ----------
    pattern : bytes or str_scalars
        Regex used to find matches

    Returns
    -------
    Match
        Match object where elements match only if the beginning of the string matches the
        regular expression pattern

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.match('_+')
    <ak.Match object: matched=False; matched=True, span=(0, 4); matched=False;
    matched=True, span=(0, 2); matched=False>
    """
    matcher = self._get_matcher(pattern)
    return matcher.get_match(MatchType.MATCH, self)
[docs]
@typechecked()
def fullmatch(self, pattern: Union[bytes, str_scalars]) -> Match:
    """
    Build a Match object in which an element matches only if the whole string matches.

    Parameters
    ----------
    pattern : bytes or str_scalars
        Regex used to find matches

    Returns
    -------
    Match
        Match object where elements match only if the whole string matches the
        regular expression pattern

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.fullmatch('_+')
    <ak.Match object: matched=False; matched=True, span=(0, 4); matched=False;
    matched=False; matched=False>
    """
    matcher = self._get_matcher(pattern)
    return matcher.get_match(MatchType.FULLMATCH, self)
[docs]
@typechecked()
def regex_split(
    self, pattern: Union[bytes, str_scalars], maxsplit: int = 0, return_segments: bool = False
) -> Union[Strings, Tuple]:
    """
    Split each string at the occurrences of a regex pattern.

    If maxsplit is nonzero, at most maxsplit splits occur per element.

    Parameters
    ----------
    pattern : bytes or str_scalars
        Regex used to split strings into substrings
    maxsplit : int, default=0
        The max number of pattern match occurrences in each element to split.
        The default maxsplit=0 splits on all occurrences
    return_segments : bool, default=False
        If True, return mapping of original strings to first substring
        in return array.

    Returns
    -------
    Union[Strings, Tuple]
        Strings
            Substrings with pattern matches removed
        pdarray, int64 (optional)
            For each original string, the index of first corresponding substring
            in the return array

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.regex_split('_+', maxsplit=2, return_segments=True)
    (array(['1', '2', '', '', '', '3', '', '4', '5____6___7', '']), array([0 3 5 6 9]))
    """
    matcher = self._get_matcher(pattern)
    return matcher.split(maxsplit, return_segments)
[docs]
@typechecked
def findall(
    self, pattern: Union[bytes, str_scalars], return_match_origins: bool = False
) -> Union[Strings, Tuple]:
    """
    Collect all non-overlapping matches of a regex pattern into a new Strings.

    Parameters
    ----------
    pattern : bytes or str_scalars
        Regex used to find matches
    return_match_origins : bool, default=False
        If True, return a pdarray containing the index of the original string each
        pattern match is from

    Returns
    -------
    Union[Strings, Tuple]
        Strings
            Strings object containing only pattern matches
        pdarray, int64 (optional)
            The index of the original string each pattern match is from

    Raises
    ------
    TypeError
        Raised if the pattern parameter is not bytes or str_scalars
    ValueError
        Raised if pattern is not a valid regex
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.find_locations

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.findall('_+', return_match_origins=True)
    (array(['_', '___', '____', '__', '___', '____', '___']), array([0 0 1 3 3 3 3]))
    """
    matcher = self._get_matcher(pattern)
    return matcher.findall(return_match_origins)
[docs]
@typechecked()
def sub(
    self, pattern: Union[bytes, str_scalars], repl: Union[bytes, str_scalars], count: int = 0
) -> Strings:
    """
    Replace non-overlapping occurrences of a regex pattern with ``repl``.

    If count is nonzero, at most count substitutions occur per element.

    Parameters
    ----------
    pattern : bytes or str_scalars
        The regex to substitute
    repl : bytes or str_scalars
        The substring to replace pattern matches with
    count : int, default=0
        The max number of pattern match occurrences in each element to replace.
        The default count=0 replaces all occurrences of pattern with repl

    Returns
    -------
    Strings
        Strings with pattern matches replaced

    Raises
    ------
    TypeError
        Raised if pattern or repl are not bytes or str_scalars
    ValueError
        Raised if pattern is not a valid regex
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.subn

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.sub(pattern='_+', repl='-', count=2)
    array(['1-2-', '-', '3', '-4-5____6___7', ''])
    """
    replacement = repl.decode() if isinstance(repl, bytes) else repl
    matcher = self._get_matcher(pattern)
    return matcher.sub(replacement, count)
[docs]
@typechecked()
def subn(
    self, pattern: Union[bytes, str_scalars], repl: Union[bytes, str_scalars], count: int = 0
) -> Tuple[Strings, pdarray]:
    """
    Do the same substitution as sub(), but also report how many substitutions were made.

    Parameters
    ----------
    pattern : bytes or str_scalars
        The regex to substitute
    repl : bytes or str_scalars
        The substring to replace pattern matches with
    count : int, default=0
        The max number of pattern match occurrences in each element to replace.
        The default count=0 replaces all occurrences of pattern with repl

    Returns
    -------
    Tuple[Strings, pdarray]
        Strings
            Strings with pattern matches replaced
        pdarray, int64
            The number of substitutions made for each element of Strings

    Raises
    ------
    TypeError
        Raised if pattern or repl are not bytes or str_scalars
    ValueError
        Raised if pattern is not a valid regex
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.sub

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
    >>> strings.subn(pattern='_+', repl='-', count=2)
    (array(['1-2-', '-', '3', '-4-5____6___7', '']), array([2 1 0 2 0]))
    """
    replacement = repl.decode() if isinstance(repl, bytes) else repl
    matcher = self._get_matcher(pattern)
    return matcher.sub(replacement, count, return_num_subs=True)
[docs]
@typechecked
def contains(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pdarray:
    r"""
    Check whether each element contains the given substring.

    Parameters
    ----------
    substr : bytes or str_scalars
        The substring in the form of string or byte array to search for
    regex : bool, default=False
        Indicates whether substr is a regular expression
        Note: only handles regular expressions supported by re2
        (does not support lookaheads/lookbehinds)

    Returns
    -------
    pdarray
        True for elements that contain substr, False otherwise

    Raises
    ------
    TypeError
        Raised if the substr parameter is not bytes or str_scalars
    ValueError
        Raised if substr is not a valid regex, or if the pattern is one of the
        currently unsupported empty-matching cases
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.startswith, Strings.endswith

    Examples
    --------
    >>> import arkouda as ak
    >>> strings = ak.array([f'{i} string {i}' for i in range(1, 6)])
    >>> strings
    array(['1 string 1', '2 string 2', '3 string 3', '4 string 4', '5 string 5'])
    >>> strings.contains('string')
    array([True True True True True])
    >>> strings.contains('string \\d', regex=True)
    array([True True True True True])
    """
    from arkouda.client import generic_msg

    if isinstance(substr, bytes):
        substr = substr.decode()
    if not regex:
        # A literal substring is searched for by escaping it into a regex.
        substr = re.escape(substr)

    # _get_matcher compiles the pattern (turning a bad regex into the documented
    # ValueError) and then runs the empty-pattern verification. Running the
    # verification here first would hand an unvalidated pattern to re.search,
    # leaking re.error, and would repeat the same check a second time.
    matcher = self._get_matcher(substr, create=False)
    if matcher is not None:
        # Reuse the cached matcher rather than issuing a separate search.
        return matcher.get_match(MatchType.SEARCH, self).matched()

    return create_pdarray(
        generic_msg(
            cmd="segmentedSearch",
            args={"objType": self.objType, "obj": self.entry, "valType": "str", "val": substr},
        )
    )
[docs]
@typechecked
def startswith(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pdarray:
    r"""
    Check whether each element starts with the given substring.

    Parameters
    ----------
    substr : bytes or str_scalars
        The prefix to search for
    regex : bool, default=False
        Indicates whether substr is a regular expression
        Note: only handles regular expressions supported by re2
        (does not support lookaheads/lookbehinds)

    Returns
    -------
    pdarray
        True for elements that start with substr, False otherwise

    Raises
    ------
    TypeError
        Raised if the substr parameter is not bytes or str_scalars
    ValueError
        Raised if substr is not a valid regex, or if the pattern is one of the
        currently unsupported empty-matching cases
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.contains, Strings.endswith

    Examples
    --------
    >>> import arkouda as ak
    >>> strings_end = ak.array([f'string {i}' for i in range(1, 6)])
    >>> strings_end
    array(['string 1', 'string 2', 'string 3', 'string 4', 'string 5'])
    >>> strings_end.startswith('string')
    array([True True True True True])
    >>> strings_start = ak.array([f'{i} string' for i in range(1,6)])
    >>> strings_start
    array(['1 string', '2 string', '3 string', '4 string', '5 string'])
    >>> strings_start.startswith('\\d str', regex = True)
    array([True True True True True])
    """
    if isinstance(substr, bytes):
        substr = substr.decode()
    if not regex:
        substr = re.escape(substr)

    # _get_matcher validates the pattern (ValueError on a bad regex) and runs
    # the empty-pattern verification, so no separate pre-check is needed.
    matcher = self._get_matcher(substr, create=False)
    if matcher is not None:
        return matcher.get_match(MatchType.MATCH, self).matched()

    # Anchor the whole pattern: a bare "^" prefix binds only to the first
    # alternative of a pattern such as "a|b". An escaped literal has no
    # alternation, and an empty pattern is left as a plain "^".
    anchored = f"^(?:{substr})" if regex and substr else "^" + substr
    return self.contains(anchored, regex=True)
[docs]
@typechecked
def endswith(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pdarray:
    r"""
    Check whether each element ends with the given substring.

    Parameters
    ----------
    substr : bytes or str_scalars
        The suffix to search for
    regex : bool, default=False
        Indicates whether substr is a regular expression
        Note: only handles regular expressions supported by re2
        (does not support lookaheads/lookbehinds)

    Returns
    -------
    pdarray
        True for elements that end with substr, False otherwise

    Raises
    ------
    TypeError
        Raised if the substr parameter is not bytes or str_scalars
    ValueError
        Raised if substr is not a valid regex, or if the pattern is one of the
        currently unsupported empty-matching cases
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    Strings.contains, Strings.startswith

    Examples
    --------
    >>> import arkouda as ak
    >>> strings_start = ak.array([f'{i} string' for i in range(1,6)])
    >>> strings_start
    array(['1 string', '2 string', '3 string', '4 string', '5 string'])
    >>> strings_start.endswith('ing')
    array([True True True True True])
    >>> strings_end = ak.array([f'string {i}' for i in range(1, 6)])
    >>> strings_end
    array(['string 1', 'string 2', 'string 3', 'string 4', 'string 5'])
    >>> strings_end.endswith('ing \\d', regex = True)
    array([True True True True True])
    """
    if isinstance(substr, bytes):
        substr = substr.decode()
    if regex:
        # Validate before the pattern reaches re.search in the verification
        # step, so a bad regex surfaces as the documented ValueError.
        try:
            re.compile(substr)
        except Exception as e:
            raise ValueError(e) from e
    else:
        substr = re.escape(substr)
    self._empty_pattern_verification(substr)

    # Anchor the whole pattern: a bare "$" suffix binds only to the last
    # alternative of a pattern such as "a|b". An escaped literal has no
    # alternation, and an empty pattern is left as a plain "$".
    anchored = f"(?:{substr})$" if regex and substr else substr + "$"
    return self.contains(anchored, regex=True)
[docs]
def split(
    self, delimiter: str, return_segments: bool = False, regex: bool = False
) -> Union[Strings, Tuple]:
    """
    Unpack delimiter-joined substrings into a flat array.

    Parameters
    ----------
    delimiter : str
        Characters used to split strings into substrings
    return_segments : bool, default=False
        If True, also return mapping of original strings to first substring
        in return array.
    regex : bool, default=False
        Indicates whether delimiter is a regular expression
        Note: only handles regular expressions supported by re2
        (does not support lookaheads/lookbehinds)

    Returns
    -------
    Union[Strings, Tuple]
        Strings
            Flattened substrings with delimiters removed
        pdarray, int64 (optional)
            For each original string, the index of first corresponding substring
            in the return array

    Raises
    ------
    ValueError
        Raised if regex is True and delimiter is not a valid regex

    See Also
    --------
    peel, rpeel

    Examples
    --------
    >>> import arkouda as ak
    >>> orig = ak.array(['one|two', 'three|four|five', 'six'])
    >>> orig.split('|')
    array(['one', 'two', 'three', 'four', 'five', 'six'])
    >>> flat, mapping = orig.split('|', return_segments=True)
    >>> mapping
    array([0 2 5])
    >>> under = ak.array(['one_two', 'three_____four____five', 'six'])
    >>> under_split, under_map = under.split('_+', return_segments=True, regex=True)
    >>> under_split
    array(['one', 'two', 'three', 'four', 'five', 'six'])
    >>> under_map
    array([0 2 5])
    """
    from arkouda.client import generic_msg

    if regex:
        # Regex delimiters are delegated to regex_split after a client-side check.
        try:
            re.compile(delimiter)
        except Exception as e:
            raise ValueError(e)
        return self.regex_split(delimiter, return_segments=return_segments)

    flatten_args = {
        "values": self.entry,
        "objtype": self.objType,
        "return_segs": return_segments,
        "regex": regex,
        "delim": delimiter,
    }
    reply = cast(str, generic_msg(cmd="segmentedFlatten", args=flatten_args))

    if not return_segments:
        return Strings.from_return_msg(reply)

    # The reply is "+"-joined: the first two fields describe the Strings,
    # the remainder describes the segments pdarray.
    fields = reply.split("+", maxsplit=2)
    flattened = Strings.from_return_msg("+".join(fields[0:2]))
    return flattened, create_pdarray(fields[2])
[docs]
@typechecked
def peel(
    self,
    delimiter: Union[bytes, str_scalars],
    times: int_scalars = 1,
    includeDelimiter: bool = False,
    keepPartial: bool = False,
    fromRight: bool = False,
    regex: bool = False,
) -> Tuple[Strings, Strings]:
    """
    Peel off one or more delimited fields from each string.

    Similar to string.partition; returns two new arrays of strings.
    *Warning*: This function is experimental and not guaranteed to work.

    Parameters
    ----------
    delimiter : bytes or str_scalars
        The separator where the split will occur
    times : int_scalars, default=1
        The number of times the delimiter is sought, i.e. skip over
        the first (times-1) delimiters
    includeDelimiter : bool, default=False
        If True, the delimiter at the split point is kept on the peeled
        field (see Examples, where it ends the strings of the first array).
    keepPartial : bool, default=False
        If True, a string that does not contain <times> instances of
        the delimiter will be returned in the first array. By default,
        such strings are returned in the second array.
    fromRight : bool, default=False
        If True, peel from the right instead of the left (see also rpeel)
    regex : bool, default=False
        Indicates whether delimiter is a regular expression
        Note: only handles regular expressions supported by re2
        (does not support lookaheads/lookbehinds)

    Returns
    -------
    Tuple[Strings, Strings]
        left : Strings
            The left-hand piece of each string
        right : Strings
            The right-hand piece of each string

    Raises
    ------
    TypeError
        Raised if the delimiter parameter is not bytes or str_scalars, if
        times is not int64, or if includeDelimiter, keepPartial, or
        fromRight is not bool
    ValueError
        Raised if times is < 1, if delimiter is not a valid regex, or if
        a regex delimiter matches the empty string
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    rpeel, stick, lstick

    Examples
    --------
    >>> import arkouda as ak
    >>> s = ak.array(['a.b', 'c.d', 'e.f.g'])
    >>> s.peel('.')
    (array(['a', 'c', 'e']), array(['b', 'd', 'f.g']))
    >>> s.peel('.', includeDelimiter=True)
    (array(['a.', 'c.', 'e.']), array(['b', 'd', 'f.g']))
    >>> s.peel('.', times=2)
    (array(['', '', 'e.f']), array(['a.b', 'c.d', 'g']))
    >>> s.peel('.', times=2, keepPartial=True)
    (array(['a.b', 'c.d', 'e.f']), array(['', '', 'g']))
    """
    from arkouda.client import generic_msg

    if isinstance(delimiter, bytes):
        delimiter = delimiter.decode()

    if regex:
        try:
            re.compile(delimiter)
        except Exception as e:
            raise ValueError(e)
        if re.search(delimiter, ""):
            raise ValueError(
                "peel with a pattern that matches the empty string are not currently supported"
            )

    if times < 1:
        raise ValueError("times must be >= 1")

    as_bool = NUMBER_FORMAT_STRINGS["bool"].format
    as_int64 = NUMBER_FORMAT_STRINGS["int64"].format
    peel_args = {
        "subcmd": "peel",
        "objType": self.objType,
        "obj": self.entry,
        "valType": "str",
        "times": as_int64(times),
        "id": as_bool(includeDelimiter),
        "keepPartial": as_bool(keepPartial),
        # The server is told whether to peel from the left, hence the negation.
        "lStr": as_bool(not fromRight),
        "regex": as_bool(regex),
        "delim": delimiter,
    }
    reply = cast(str, generic_msg(cmd="segmentedPeel", args=peel_args))

    # Four "+"-joined fields: the first pair builds the left Strings,
    # the second pair builds the right Strings.
    fields = reply.split("+", maxsplit=3)
    left_str = Strings.from_return_msg("+".join(fields[0:2]))
    right_str = Strings.from_return_msg("+".join(fields[2:4]))
    return left_str, right_str
[docs]
def rpeel(
    self,
    delimiter: Union[bytes, str_scalars],
    times: int_scalars = 1,
    includeDelimiter: bool = False,
    keepPartial: bool = False,
    regex: bool = False,
) -> Tuple[Strings, Strings]:
    """
    Peel off one or more delimited fields from the end of each string.

    Similar to string.rpartition; returns two new arrays of strings. This is
    ``peel`` called with ``fromRight=True``.
    *Warning*: This function is experimental and not guaranteed to work.

    Parameters
    ----------
    delimiter : bytes or str_scalars
        The separator where the split will occur
    times : int_scalars, default=1
        The number of times the delimiter is sought, i.e. skip over
        the last (times-1) delimiters
    includeDelimiter : bool, default=False
        Forwarded unchanged to ``peel``; controls whether the delimiter at
        the split point is kept on the peeled field.
    keepPartial : bool, default=False
        If True, a string that does not contain <times> instances of
        the delimiter will be returned in the second array. By default,
        such strings are returned in the first array.
    regex : bool, default=False
        Indicates whether delimiter is a regular expression
        Note: only handles regular expressions supported by re2
        (does not support lookaheads/lookbehinds)

    Returns
    -------
    Tuple[Strings, Strings]
        left : Strings
            The remainder of the string after peeling
        right : Strings
            The field(s) that were peeled from the right of each string

    Raises
    ------
    TypeError
        Raised if the delimiter parameter is not bytes or str_scalars or
        if times is not int64
    ValueError
        Raised if times is < 1 or if delimiter is not a valid regex
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    peel, stick, lstick

    Examples
    --------
    >>> import arkouda as ak
    >>> s = ak.array(['a.b', 'c.d', 'e.f.g'])
    >>> s.rpeel('.')
    (array(['a', 'c', 'e.f']), array(['b', 'd', 'g']))

    Compared against peel

    >>> s.peel('.')
    (array(['a', 'c', 'e']), array(['b', 'd', 'f.g']))
    """
    peel_options = {
        "times": times,
        "includeDelimiter": includeDelimiter,
        "keepPartial": keepPartial,
        "fromRight": True,
        "regex": regex,
    }
    return self.peel(delimiter, **peel_options)
[docs]
@typechecked
def stick(
    self, other: Strings, delimiter: Union[bytes, str_scalars] = "", toLeft: bool = False
) -> Strings:
    """
    Join the strings of another array onto one end of this array's strings.

    A delimiter may optionally be inserted between the two.
    *Warning*: This function is experimental and not guaranteed to work.

    Parameters
    ----------
    other : Strings
        The strings to join onto self's strings
    delimiter : bytes or str_scalars, default=""
        String inserted between self and other
    toLeft : bool, default=False
        If True, join other strings to the left of self. By default,
        other is joined to the right of self.

    Returns
    -------
    Strings
        The array of joined strings

    Raises
    ------
    TypeError
        Raised if the delimiter parameter is not bytes or str_scalars
        or if the other parameter is not a Strings instance
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    lstick, peel, rpeel

    Examples
    --------
    >>> import arkouda as ak
    >>> s = ak.array(['a', 'c', 'e'])
    >>> t = ak.array(['b', 'd', 'f'])
    >>> s.stick(t, delimiter='.')
    array(['a.b', 'c.d', 'e.f'])
    """
    from arkouda.client import generic_msg

    separator = delimiter.decode() if isinstance(delimiter, bytes) else delimiter
    stick_args = {
        "op": "stick",
        "objType": self.objType,
        "obj": self.entry,
        "otherType": other.objType,
        "other": other.entry,
        "left": NUMBER_FORMAT_STRINGS["bool"].format(toLeft),
        "delim": separator,
    }
    reply = generic_msg(cmd="segmentedBinopvv", args=stick_args)
    return Strings.from_return_msg(cast(str, reply))
def __add__(self, other: Strings) -> Strings:
    """Implement ``self + other`` as ``self.stick(other)`` with the default delimiter."""
    joined = self.stick(other)
    return joined
[docs]
def lstick(self, other: Strings, delimiter: Union[bytes, str_scalars] = "") -> Strings:
    """
    Join the strings of another array onto the left of this array's strings.

    A delimiter may optionally be inserted between the two. This is ``stick``
    called with ``toLeft=True``.
    *Warning*: This function is experimental and not guaranteed to work.

    Parameters
    ----------
    other : Strings
        The strings to join onto self's strings
    delimiter : bytes or str_scalars, default=""
        String inserted between self and other

    Returns
    -------
    Strings
        The array of joined strings, as other + self

    Raises
    ------
    TypeError
        Raised if the delimiter parameter is neither bytes nor a str
        or if the other parameter is not a Strings instance
    RuntimeError
        Raised if there is a server-side error thrown

    See Also
    --------
    stick, peel, rpeel

    Examples
    --------
    >>> import arkouda as ak
    >>> s = ak.array(['a', 'c', 'e'])
    >>> t = ak.array(['b', 'd', 'f'])
    >>> s.lstick(t, delimiter='.')
    array(['b.a', 'd.c', 'f.e'])
    """
    joined = self.stick(other, toLeft=True, delimiter=delimiter)
    return joined
def __radd__(self, other: Strings) -> Strings:
    """Implement ``other + self`` as ``self.lstick(other)`` with the default delimiter."""
    joined = self.lstick(other)
    return joined
[docs]
def get_prefixes(
    self, n: int_scalars, return_origins: bool = True, proper: bool = True
) -> Union[Strings, Tuple[Strings, pdarray]]:
    """
    Return the n-long prefix of each string, where possible.

    Parameters
    ----------
    n : int_scalars
        Length of prefix
    return_origins : bool, default=True
        If True, return a logical index indicating which strings
        were long enough to return an n-prefix
    proper : bool, default=True
        If True, only return proper prefixes, i.e. from strings
        that are at least n+1 long. If False, allow the entire
        string to be returned as a prefix.

    Returns
    -------
    Union[Strings, Tuple[Strings, pdarray]]
        prefixes : Strings
            The array of n-character prefixes; the number of elements is the number of
            True values in the returned mask.
        origin_indices : pdarray, bool
            Boolean array that is True where the string was long enough to return
            an n-character prefix, False otherwise. Only returned when
            return_origins is True.
    """
    from arkouda.client import generic_msg

    request = {
        "objType": self.objType,
        "name": self,
        "nChars": n,
        "returnOrigins": return_origins,
        "kind": "prefixes",
        "proper": proper,
    }
    reply = cast(str, generic_msg(cmd="segmentedSubstring", args=request))

    if not return_origins:
        return Strings.from_return_msg(reply)

    # First two "+"-separated fields build the Strings; the third is the mask.
    fields = reply.split("+")
    prefixes = Strings.from_return_msg("+".join(fields[:2]))
    long_enough = create_pdarray(fields[2])
    return prefixes, cast(pdarray, long_enough)
[docs]
def get_suffixes(
    self, n: int_scalars, return_origins: bool = True, proper: bool = True
) -> Union[Strings, Tuple[Strings, pdarray]]:
    """
    Return the n-long suffix of each string, where possible.

    Parameters
    ----------
    n : int_scalars
        Length of suffix
    return_origins : bool, default=True
        If True, return a logical index indicating which strings
        were long enough to return an n-suffix
    proper : bool, default=True
        If True, only return proper suffixes, i.e. from strings
        that are at least n+1 long. If False, allow the entire
        string to be returned as a suffix.

    Returns
    -------
    Union[Strings, Tuple[Strings, pdarray]]
        suffixes : Strings
            The array of n-character suffixes; the number of elements is the number of
            True values in the returned mask.
        origin_indices : pdarray, bool
            Boolean array that is True where the string was long enough to return
            an n-character suffix, False otherwise. Only returned when
            return_origins is True.
    """
    from arkouda.client import generic_msg

    request = {
        "objType": self.objType,
        "name": self,
        "nChars": n,
        "returnOrigins": return_origins,
        "kind": "suffixes",
        "proper": proper,
    }
    reply = cast(str, generic_msg(cmd="segmentedSubstring", args=request))

    if not return_origins:
        return Strings.from_return_msg(reply)

    # First two "+"-separated fields build the Strings; the third is the mask.
    fields = reply.split("+")
    suffixes = Strings.from_return_msg("+".join(fields[:2]))
    long_enough = create_pdarray(fields[2])
    return suffixes, cast(pdarray, long_enough)
[docs]
def hash(self) -> Tuple[pdarray, pdarray]:
    """
    Compute a 128-bit hash of each string.

    Returns
    -------
    Tuple[pdarray, pdarray]
        A tuple of two int64 pdarrays. The ith hash value is the concatenation
        of the ith values from each array.

    Notes
    -----
    The implementation uses SipHash128, a fast and balanced hash function. For
    realistic numbers of strings (up to about 10**15), the probability of a
    collision between two 128-bit hash values is negligible.
    """
    from arkouda.client import generic_msg

    # TODO fix this to return a single pdarray of hashes
    reply = cast(
        str, generic_msg(cmd="segmentedHash", args={"objType": self.objType, "obj": self.entry})
    )
    # The reply carries the two halves of the hash, joined by "+".
    first_half, second_half = reply.split("+")
    return create_pdarray(first_half), create_pdarray(second_half)
[docs]
def group(self) -> pdarray:
    """
    Return the permutation that groups the array, placing equivalent strings together.

    All instances of the same string are guaranteed to lie in one contiguous
    block of the permuted array, but the blocks are not necessarily ordered.

    Returns
    -------
    pdarray
        The permutation that groups the array by value

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing group request or
        creating the pdarray encapsulating the return message

    See Also
    --------
    GroupBy, unique

    Notes
    -----
    If the arkouda server is compiled with "-sSegmentedString.useHash=true",
    then arkouda uses 128-bit hash values to group strings, rather than sorting
    the strings directly. This method is fast, but the resulting permutation
    merely groups equivalent strings and does not sort them. If the "useHash"
    parameter is false, then a full sort is performed.
    """
    from arkouda.client import generic_msg

    group_args = {"objType": self.objType, "obj": self.entry}
    reply = generic_msg(cmd="segmentedGroup", args=group_args)
    return create_pdarray(reply)
def _get_grouping_keys(self) -> List[Strings]:
    """
    Produce the grouping keys that GroupBy uses for this array.

    API: every groupable array must define this method, and it must return a
    list of arrays that can be (co)argsorted. For Strings that list holds only
    the object itself.
    """
    keys = [self]
    return keys
[docs]
def flatten(self) -> Strings:
    """
    Return the array collapsed into one dimension.

    Returns
    -------
    Strings
        This same Strings object; no copy is made.

    Note
    ----
    As multidimensional Strings are not currently supported,
    flatten on a Strings object will always return itself.
    """
    return self
[docs]
def to_ndarray(self) -> np.ndarray:
    """
    Convert the array to a np.ndarray, transferring array data from the arkouda server to Python.

    If the array exceeds a built-in size limit, a RuntimeError is raised.

    Returns
    -------
    np.ndarray
        A numpy ndarray with the same strings as this array

    Raises
    ------
    RuntimeError
        Raised if either component of the array is too large to transfer

    Notes
    -----
    The number of bytes in the array cannot exceed ``ak.client.maxTransferBytes``,
    otherwise a ``RuntimeError`` will be raised. This is to protect the user
    from overflowing the memory of the system on which the Python client
    is running, under the assumption that the server is running on a
    distributed system with much more memory than the client. The user
    may override this limit by setting ak.client.maxTransferBytes to a larger
    value, but proceed with caution.

    See Also
    --------
    array()
    tolist()

    Examples
    --------
    >>> import arkouda as ak
    >>> a = ak.array(["hello", "my", "world"])
    >>> a.to_ndarray()
    array(['hello', 'my', 'world'], dtype='<U5')
    >>> type(a.to_ndarray())
    <class 'numpy.ndarray'>
    """
    # Get offsets and append total bytes so that diff yields every segment length
    npoffsets = np.hstack((self._comp_to_ndarray("offsets"), np.array([self.nbytes])))
    # Get contents of strings (will error if too large)
    npvalues = self._comp_to_ndarray("values")
    # Compute lengths, discounting null terminators
    lengths = np.diff(npoffsets) - 1
    # Numpy dtype is based on max string length (in bytes, an upper bound on characters)
    dt = f"<U{lengths.max() if len(lengths) > 0 else 1}"
    res = np.empty(self.size, dtype=dt)
    # Decode each segment from one contiguous byte copy instead of joining
    # the bytes one numpy scalar at a time.
    for i, (o, ln) in enumerate(zip(npoffsets, lengths)):
        res[i] = npvalues[o : o + ln].tobytes().decode()
    return res
[docs]
def tolist(self) -> List[str]:
    """
    Convert the SegString to a list, transferring data from the arkouda server to Python.

    If the SegString exceeds a built-in size limit, a RuntimeError is raised.

    Returns
    -------
    List[str]
        A list with the same strings as this SegString

    Notes
    -----
    The number of bytes in the array cannot exceed ``ak.client.maxTransferBytes``,
    otherwise a ``RuntimeError`` will be raised. This is to protect the user
    from overflowing the memory of the system on which the Python client
    is running, under the assumption that the server is running on a
    distributed system with much more memory than the client. The user
    may override this limit by setting ak.client.maxTransferBytes to a larger
    value, but proceed with caution.

    See Also
    --------
    to_ndarray()

    Examples
    --------
    >>> import arkouda as ak
    >>> a = ak.array(["hello", "my", "world"])
    >>> a.tolist()
    ['hello', 'my', 'world']
    >>> type(a.tolist())
    <class 'list'>
    """
    # Go through the ndarray conversion, then let numpy build the Python list.
    as_numpy = self.to_ndarray()
    return cast(List[str], as_numpy.tolist())
def _comp_to_ndarray(self, comp: str) -> np.ndarray:
    """
    Fetch one component of the string structure as a NumPy ndarray.

    Parameters
    ----------
    comp : str
        The strings component to request. ``"offsets"`` is read as int64;
        any other value is read as uint8.

    Returns
    -------
    np.ndarray
        A mutable copy of the component data received from the server

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error thrown, if the component size
        exceeds the built-in client.maxTransferBytes size limit, or if the bytes
        received does not match expected number of bytes

    Notes
    -----
    The number of bytes in the array cannot exceed ``client.maxTransferBytes``,
    otherwise a ``RuntimeError`` will be raised. This is to protect the user
    from overflowing the memory of the system on which the Python client
    is running, under the assumption that the server is running on a
    distributed system with much more memory than the client. The user
    may override this limit by setting client.maxTransferBytes to a larger
    value, but proceed with caution.
    """
    from arkouda.client import generic_msg, maxTransferBytes

    ak_dtypes = arkouda.numpy.dtypes
    wants_offsets = comp == "offsets"

    # Total number of bytes in the array data
    if wants_offsets:
        array_bytes = self.size * ak_dtypes.dtype(ak_dtypes.int64).itemsize
    else:
        array_bytes = self.nbytes * ak_dtypes.dtype(ak_dtypes.uint8).itemsize

    # Guard against overflowing client memory
    if array_bytes > maxTransferBytes:
        raise RuntimeError(
            "Array exceeds allowed size for transfer. Increase ak.client.maxTransferBytes to allow"
        )

    # The reply from the server will be a bytes object
    rep_msg = generic_msg(
        cmd=CMD_TO_NDARRAY, args={"obj": self.entry, "comp": comp}, recv_binary=True
    )

    # Make sure the received data has the expected length
    if len(rep_msg) != array_bytes:
        raise RuntimeError(f"Expected {array_bytes} bytes but received {len(rep_msg)}")

    # The server sends its native-endian bytes, so read them with its byte order.
    dt: np.dtype = np.dtype(np.int64) if wants_offsets else np.dtype(np.uint8)
    byte_order = ">" if ak_dtypes.get_server_byteorder() == "big" else "<"
    dt = dt.newbyteorder(byte_order)

    # frombuffer over an immutable buffer is read-only, so copy to get a mutable array.
    payload = rep_msg.encode("utf_8") if isinstance(rep_msg, str) else rep_msg
    return np.frombuffer(payload, dt).copy()
[docs]
def astype(self, dtype: Union[np.dtype, str]) -> Union[pdarray, Strings]:
    """
    Cast the values of this Strings object to the provided dtype.

    Parameters
    ----------
    dtype : np.dtype or str
        Dtype to cast to

    Returns
    -------
    pdarray or Strings
        The result of casting this Strings object to the specified data type

    Notes
    -----
    This is essentially shorthand for ak.cast(x, '<dtype>') where x is
    this Strings object.
    """
    from arkouda.numpy import cast as akcast

    converted = akcast(self, dtype)
    # type_cast only narrows the static type; it performs no conversion.
    return type_cast(Union[pdarray, Strings], converted)
[docs]
def to_parquet(
    self,
    prefix_path: str,
    dataset: str = "strings_array",
    mode: Literal["truncate", "append"] = "truncate",
    compression: Optional[Literal["snappy", "gzip", "brotli", "zstd", "lz4"]] = None,
) -> str:
    """
    Save the Strings object to Parquet.

    The result is a collection of files, one file per locale of the arkouda
    server, where each filename starts with prefix_path. Each locale saves
    its chunk of the array to its corresponding file.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str, default="strings_array"
        Name of the dataset to create in files (must not already exist)
    mode : {"truncate", "append"}, default = "truncate"
        By default, truncate (overwrite) output files, if they exist.
        If 'append', attempt to create new dataset in existing files.
    compression : {"snappy", "gzip", "brotli", "zstd", "lz4"}, optional
        Sets the compression type used with Parquet files

    Returns
    -------
    str
        string message indicating result of save operation

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the Strings object

    Notes
    -----
    - The prefix_path must be visible to the arkouda server and the user must
      have write permission.
    - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
      ranges from 0 to ``numLocales``.
    - 'append' write mode is supported, but is not efficient.
    - If any of the output files already exist and
      the mode is 'truncate', they will be overwritten. If the mode is 'append'
      and the number of output files is less than the number of locales or a
      dataset with the same name already exists, a ``RuntimeError`` will result.
    - Any file extension can be used. The file I/O does not rely on the extension to
      determine the file format.
    """
    from arkouda.client import generic_msg
    from arkouda.pandas.io import _mode_str_to_int

    # Assemble the request payload up front so the server call stays short.
    write_args = {
        "values": self.entry,
        "dset": dataset,
        "mode": _mode_str_to_int(mode),
        "prefix": prefix_path,
        "objType": "strings",
        "dtype": self.dtype,
        "compression": compression,
    }
    reply = generic_msg("writeParquet", write_args)
    return cast(str, reply)
[docs]
def to_hdf(
    self,
    prefix_path: str,
    dataset: str = "strings_array",
    mode: Literal["truncate", "append"] = "truncate",
    save_offsets: bool = True,
    file_type: Literal["single", "distribute"] = "distribute",
) -> str:
    """
    Save the Strings object to HDF5.

    The object can be saved to a collection of files or a single file.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str, default="strings_array"
        The name of the Strings dataset to be written, defaults to strings_array
    mode : {"truncate", "append"}, default = "truncate"
        By default, truncate (overwrite) output files, if they exist.
        If 'append', create a new Strings dataset within existing files.
    save_offsets : bool, default=True
        Defaults to True which will instruct the server to save the offsets array to HDF5.
        If False the offsets array will not be saved and will be derived from the string
        values upon load/read.
    file_type : {"single", "distribute"}, default = "distribute"
        'distribute' spreads the dataset over a file per locale.
        'single' saves the dataset to one file.

    Returns
    -------
    str
        String message indicating result of save operation

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the Strings object

    Notes
    -----
    - Strings state is saved as two datasets within an hdf5 group:
      one for the string characters and one for the
      segments corresponding to the start of each string
    - the hdf5 group is named via the dataset parameter.
    - The prefix_path must be visible to the arkouda server and the user must
      have write permission.
    - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
      ranges from 0 to ``numLocales`` for `file_type='distribute'`. Otherwise,
      the file name will be `prefix_path`.
    - If any of the output files already exist and
      the mode is 'truncate', they will be overwritten. If the mode is 'append'
      and the number of output files is less than the number of locales or a
      dataset with the same name already exists, a ``RuntimeError`` will result.
    - Any file extension can be used. The file I/O does not rely on the extension to
      determine the file format.

    See Also
    --------
    to_parquet, update_hdf
    """
    from arkouda.client import generic_msg
    from arkouda.pandas.io import _file_type_to_int, _mode_str_to_int

    # Assemble the request payload up front so the server call stays short.
    write_args = {
        "values": self.entry,
        "dset": dataset,
        "write_mode": _mode_str_to_int(mode),
        "filename": prefix_path,
        "dtype": self.dtype,
        "save_offsets": save_offsets,
        "objType": "strings",
        "file_format": _file_type_to_int(file_type),
    }
    reply = generic_msg("tohdf", write_args)
    return cast(str, reply)
[docs]
def update_hdf(
    self,
    prefix_path: str,
    dataset: str = "strings_array",
    save_offsets: bool = True,
    repack: bool = True,
) -> str:
    """
    Overwrite the dataset with the name provided with this Strings object.

    If the dataset does not exist it is added.

    Parameters
    ----------
    prefix_path : str
        Directory and filename prefix that all output files share
    dataset : str, default="strings_array"
        Name of the dataset to create in files
    save_offsets : bool, default=True
        Defaults to True which will instruct the server to save the offsets array to HDF5.
        If False the offsets array will not be saved and will be derived from the string
        values upon load/read.
    repack : bool, default=True
        HDF5 does not release memory on delete. When True, the inaccessible
        data (that was overwritten) is removed. When False, the data remains, but is
        inaccessible. Setting to false will yield better performance, but will cause
        file sizes to expand.

    Returns
    -------
    str
        success message if successful

    Raises
    ------
    RuntimeError
        Raised if a server-side error is thrown saving the Strings object

    Notes
    -----
    - If file does not contain File_Format attribute to indicate how it was saved,
      the file name is checked for _LOCALE#### to determine if it is distributed.
    - If the dataset provided does not exist, it will be added
    """
    from arkouda.client import generic_msg
    from arkouda.pandas.io import (
        _file_type_to_int,
        _get_hdf_filetype,
        _mode_str_to_int,
        _repack_hdf,
    )

    # Determine the format (single/distribute) that the existing file was saved in
    existing_format = _get_hdf_filetype(prefix_path + "*")

    # The write is issued in 'append' mode with overwrite enabled, so an
    # existing dataset of the same name is replaced.
    update_args = {
        "values": self,
        "dset": dataset,
        "write_mode": _mode_str_to_int("append"),
        "filename": prefix_path,
        "dtype": self.dtype,
        "save_offsets": save_offsets,
        "objType": "strings",
        "file_format": _file_type_to_int(existing_format),
        "overwrite": True,
    }
    reply = generic_msg(cmd="tohdf", args=update_args)

    if repack:
        _repack_hdf(prefix_path)

    return cast(str, reply)
[docs]
@typechecked
def to_csv(
    self,
    prefix_path: str,
    dataset: str = "strings_array",
    col_delim: str = ",",
    overwrite: bool = False,
) -> str:
    r"""
    Write Strings to CSV file(s).

    The file will contain a single column with the Strings data.
    All CSV files written by Arkouda include a header denoting data types of the columns.
    Unlike other file formats, CSV files store Strings in their UTF-8 format instead of
    storing bytes as uint(8).

    Parameters
    ----------
    prefix_path : str
        The filename prefix to be used for saving files. Files will have _LOCALE#### appended
        when they are written to disk.
    dataset : str, default="strings_array"
        Column name to save the Strings under. Defaults to "strings_array".
    col_delim : str, default=","
        Defaults to ",". Value to be used to separate columns within the file.
        Please be sure that the value used DOES NOT appear in your dataset.
    overwrite : bool, default=False
        Defaults to False. If True, any existing files matching your provided prefix_path will
        be overwritten. If False, an error will be returned if existing files are found.

    Returns
    -------
    str
        response message

    Raises
    ------
    RuntimeError
        Presumably raised if the server reports an error while writing
        (not verifiable from this method alone)

    Notes
    -----
    - Argument types are checked by the ``typechecked`` decorator; the exact
      exception class for a mismatch depends on the installed typeguard version.
    - CSV format is not currently supported by load/load_all operations
    - The column delimiter is expected to be the same for column names and data
    - Be sure that column delimiters are not found within your data.
    - All CSV files must delimit rows using newline (``\n``) at this time.
    """
    from arkouda.client import generic_msg

    # A Strings object is written as a single-column CSV.
    csv_args = {
        "datasets": [self],
        "col_names": [dataset],
        "filename": prefix_path,
        "num_dsets": 1,
        "col_delim": col_delim,
        "dtypes": [self.dtype.name],
        "row_count": self.size,
        "overwrite": overwrite,
    }
    reply = generic_msg(cmd="writecsv", args=csv_args)
    return cast(str, reply)
def _list_component_names(self) -> List[str]:
    """
    Return a list of all component names.

    The names are obtained from the underlying ``entry`` and copied into a
    new list.

    Returns
    -------
    List[str]
        List of all component names
    """
    entry_names = self.entry._list_component_names()
    return list(entry_names)
[docs]
def info(self) -> str:
    """
    Return a JSON formatted string containing information about all components of self.

    Returns
    -------
    str
        JSON string containing information about all components of self
    """
    component_names = self._list_component_names()
    return information(component_names)
[docs]
def pretty_print_info(self) -> None:
    """
    Print information about all components of self in a human readable format.

    The printing itself is delegated to the underlying ``entry``.
    """
    backing_entry = self.entry
    backing_entry.pretty_print_info()
[docs]
@typechecked
def register(self, user_defined_name: str) -> Strings:
    """
    Register this Strings object with a user defined name in the arkouda server
    so it can be attached to later using Strings.attach().

    This is an in-place operation. An object that is already registered must
    be unregistered before it can be registered under another name.
    A name can only be registered to one object at a time.

    Parameters
    ----------
    user_defined_name : str
        user defined name which the Strings object is to be registered under

    Returns
    -------
    Strings
        The same Strings object which is now registered with the arkouda server and
        has an updated registered name.
        This is an in-place modification, the original is returned to support a
        fluid programming style.
        Please note you cannot register two different objects with the same name.

    Raises
    ------
    RegistrationError
        Raised if this object is already registered.
        Per the server contract, also raised if the server was unable to register
        the Strings object with the user_defined_name; if the name is taken by
        another object, that object should be unregistered first to free up the
        registration name.

    See Also
    --------
    attach, unregister

    Notes
    -----
    Registered names/Strings objects in the server are immune to deletion
    until they are unregistered.

    The argument type is checked by the ``typechecked`` decorator; the exact
    exception class for a non-str name depends on the installed typeguard version.
    """
    from arkouda.client import generic_msg

    # The server is only consulted when a registered name is recorded locally.
    already_registered = self.registered_name is not None and self.is_registered()
    if already_registered:
        raise RegistrationError(f"This object is already registered as {self.registered_name}")

    registration_args = {
        "name": user_defined_name,
        "objType": self.objType,
        "array": self.name,
    }
    generic_msg(cmd="register", args=registration_args)

    # Record the name locally only after the server call has completed.
    self.registered_name = user_defined_name
    return self
[docs]
def unregister(self) -> None:
    """
    Unregister a Strings object in the arkouda server which was previously
    registered using register() and/or attached to using attach().

    Raises
    ------
    RegistrationError
        Raised if this object has no registered name
    RuntimeError
        Raised if the server could not find the internal name/symbol to remove

    See Also
    --------
    register, attach

    Notes
    -----
    Registered names/Strings objects in the server are immune to deletion until
    they are unregistered.
    """
    from arkouda.numpy.util import unregister

    current_name = self.registered_name
    if not current_name:
        raise RegistrationError("This object is not registered")

    unregister(current_name)
    # Clear the local record once the server-side unregister has completed.
    self.registered_name = None
[docs]
def is_registered(self) -> np.bool_:
    """
    Return True iff the object is contained in the registry.

    When a registered name is recorded on this object, that name is looked
    up; otherwise the object's own name is looked up as a component.

    Returns
    -------
    np.bool_
        Indicates if the object is contained in the registry

    Raises
    ------
    RuntimeError
        Raised if there's a server-side error thrown
    """
    from arkouda.numpy.util import is_registered

    if self.registered_name is not None:
        found = is_registered(self.registered_name)
    else:
        found = is_registered(self.name, as_component=True)
    return np.bool_(found)
[docs]
def transfer(self, hostname: str, port: int_scalars) -> Union[str, memoryview]:
    """
    Send a Strings object to a different Arkouda server.

    Parameters
    ----------
    hostname : str
        The hostname where the Arkouda server intended to
        receive the Strings object is running.
    port : int_scalars
        The port to send the array over. This needs to be an
        open port (i.e., not one that the Arkouda server is
        running on). This will open up `numLocales` ports,
        each of which in succession, so will use ports of the
        range {port..(port+numLocales)} (e.g., running an
        Arkouda server of 4 nodes, port 1234 is passed as
        `port`, Arkouda will use ports 1234, 1235, 1236,
        and 1237 to send the array data).
        This port must match the port passed to the call to
        `ak.receive_array()`.

    Returns
    -------
    str or memoryview
        The server's reply; a message indicating a complete transfer

    Raises
    ------
    RuntimeError
        Presumably raised if the server reports an error during the transfer
        (not verifiable from this method alone)
    """
    from arkouda.client import generic_msg

    # hostname is the hostname to send to
    send_args = {
        "values": self.entry,
        "hostname": hostname,
        "port": port,
        "objType": "strings",
    }
    return generic_msg(cmd="sendArray", args=send_args)
[docs]
@staticmethod
def concatenate_uniquely(strings: List[Strings]) -> Strings:
    """
    Concatenate a list of Strings into a single Strings object
    containing only unique strings. Order may not be preserved.

    Parameters
    ----------
    strings : List[Strings]
        List of segmented string objects to concatenate.

    Returns
    -------
    Strings
        A new Strings object containing the unique values.

    Raises
    ------
    ValueError
        Raised if ``strings`` is empty
    """
    from arkouda.client import generic_msg

    if not strings:
        raise ValueError("Must provide at least one Strings object")

    # Only the server-side names are sent; the server does the work.
    server_names = [segstr.name for segstr in strings]
    request_args = {"names": server_names}
    reply = generic_msg(cmd="concatenateUniquely", args=request_args)

    return Strings.from_return_msg(cast(str, reply))
[docs]
def argsort(
    self,
    algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD,
    ascending: bool = True,
) -> pdarray:
    """
    Return the permutation that sorts the Strings.

    Parameters
    ----------
    algorithm : SortingAlgorithm, default SortingAlgorithm.RadixSortLSD
        The algorithm to use for sorting.
    ascending : bool, default True
        Whether to sort in ascending order. When False, the permutation
        returned by the server is flipped.

    Returns
    -------
    pdarray
        The indices that sort the Strings.
    """
    from arkouda.client import generic_msg
    from arkouda.numpy.manipulation_functions import flip
    from arkouda.numpy.pdarraycreation import zeros

    # An empty Strings needs no server round trip.
    if self.size == 0:
        return zeros(0, dtype=akint64)  # Strings always maps to int64 indices

    sort_args = {
        "name": self.entry.name,
        "algoName": algorithm.name,
    }
    reply = generic_msg(cmd="argsortStrings", args=sort_args)
    permutation = create_pdarray(cast(str, reply))

    if ascending:
        return permutation
    return flip(permutation)
[docs]
def take(self, indices: Union[numeric_scalars, pdarray], axis: Optional[int] = None) -> Strings:
    """
    Take elements from the array along an axis.

    When axis is not None, this function does the same thing as "fancy" indexing (indexing
    arrays using arrays); however, it can be easier to use if you need elements along a
    given axis. A call such as ``np.take(arr, indices, axis=3)`` is equivalent to
    ``arr[:,:,:,indices,...]``.

    Parameters
    ----------
    indices : numeric_scalars or pdarray
        The indices of the values to extract. Also allow scalars for indices.
    axis : int, optional
        The axis over which to select values. By default, the flattened input array is used.

    Returns
    -------
    Strings
        A Strings containing the selected elements.

    Examples
    --------
    >>> import arkouda as ak
    >>> a = ak.array(["a","b","c"])
    >>> indices = [0, 1]
    >>> a.take(indices)
    array(['a', 'b'])
    """
    from arkouda.numpy.numeric import take
    from arkouda.numpy.pdarraycreation import arange

    # Select from the positions 0..size-1, then index self with the result.
    positions = arange(self.size)
    selected = take(positions, indices=indices, axis=axis)
    return self[selected]