import json
from enum import Enum
from typing import cast
from arkouda.client import generic_msg
from arkouda.pdarrayclass import create_pdarray, pdarray
__all__ = ["Match"]
class MatchType(Enum):
    # Which regex operation a Match object was produced by; Match.match_type()
    # reports the member's name (e.g. 'SEARCH').
    SEARCH = 1
    MATCH = 2
    FULLMATCH = 3
class Match:
    """
    Client-side handle on the result of a regex search/match/fullmatch.

    The object keeps the per-element match flags together with the start
    positions and lengths of the matches, and remembers the server-side name
    of the parent entry and the pattern so that follow-up requests
    (``find_matches``, ``group``) can be sent to the server.
    """

    def __init__(
        self,
        matched: pdarray,
        starts: pdarray,
        lengths: pdarray,
        indices: pdarray,
        parent_entry_name: str,
        match_type: MatchType,
        pattern: str,
    ):
        """
        Store the arrays describing a match result.

        Parameters
        ----------
        matched : pdarray
            Per-element flag telling whether that element matched.
        starts : pdarray
            Start positions of the matches.
        lengths : pdarray
            Lengths of the matches; end positions are derived as
            ``starts + lengths``.
        indices : pdarray
            For an element ``i`` that matched, ``indices[i]`` is the position
            used to look up its entry in ``starts`` / ends.
        parent_entry_name : str
            Name sent to the server as ``parent_name`` in follow-up requests.
        match_type : MatchType
            Which kind of match produced this object.
        pattern : str
            The regex pattern; stored as ``self.re`` and re-sent by ``group``.
        """
        self._objtype = type(self).__name__
        self._parent_entry_name = parent_entry_name
        self._match_type = match_type
        self._matched = matched
        self._starts = starts
        self._lengths = lengths
        self._ends = starts + lengths
        self._indices = indices
        self._parent_obj: object = None
        self.re = pattern

    def __str__(self):
        """
        Summarize the match result element by element.

        All elements are listed when there are at most ``pdarrayIterThresh``
        of them; otherwise only the first three and last three are shown,
        separated by ``"... "``.
        """
        from arkouda.client import pdarrayIterThresh

        size = self._matched.size
        if size <= pdarrayIterThresh:
            vals = [self.__getitem__(i) for i in range(size)]
        else:
            vals = [self.__getitem__(i) for i in range(3)]
            vals.append("... ")
            vals.extend([self.__getitem__(i) for i in range(size - 3, size)])
        return f"<ak.{self._objtype} object: {'; '.join(vals)}>"

    def __getitem__(self, item):
        """
        Return a textual description of the match result at ``item``.

        The result is a string: ``matched=<flag>`` when the element did not
        match, and ``matched=<flag>, span=(<start>, <end>)`` when it did.
        """
        # Look each value up once and reuse it rather than indexing the same
        # pdarray repeatedly.
        hit = self._matched[item]
        if not hit:
            return f"matched={hit}"
        pos = self._indices[item]
        return f"matched={hit}, span=({self._starts[pos]}, {self._ends[pos]})"

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()

    def matched(self) -> pdarray:
        """
        Returns a boolean array indicating whether each element matched

        Returns
        -------
        pdarray, bool
            True for elements that match, False otherwise

        Examples
        --------
        >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
        >>> strings.search('_+').matched()
        array([True True False True False])
        """
        return self._matched

    def start(self) -> pdarray:
        """
        Returns the starts of matches

        Returns
        -------
        pdarray, int64
            The start positions of matches

        Examples
        --------
        >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
        >>> strings.search('_+').start()
        array([1 0 0])
        """
        return self._starts

    def end(self) -> pdarray:
        """
        Returns the ends of matches

        Returns
        -------
        pdarray, int64
            The end positions of matches

        Examples
        --------
        >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
        >>> strings.search('_+').end()
        array([2 4 2])
        """
        return self._ends

    def match_type(self) -> str:
        """
        Returns the type of the Match object

        Returns
        -------
        str
            MatchType of the Match object

        Examples
        --------
        >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
        >>> strings.search('_+').match_type()
        'SEARCH'
        """
        return self._match_type.name

    def _find_all(self, matched, starts, lengths, indices, rtn_origins):
        """
        Send a ``segmentedFindAll`` request and unpack the reply.

        Shared by ``find_matches`` and ``group``. Returns a Strings object, or
        a ``(Strings, pdarray)`` tuple when ``rtn_origins`` is truthy.
        """
        # Imported here rather than at module level, as in the original methods.
        from arkouda.strings import Strings

        repMsg = cast(
            str,
            generic_msg(
                cmd="segmentedFindAll",
                args={
                    "objType": self._objtype,
                    "parent_name": self._parent_entry_name,
                    "num_matches": matched,
                    "starts": starts,
                    "lengths": lengths,
                    "indices": indices,
                    "rtn_origins": rtn_origins,
                },
            ),
        )
        if not rtn_origins:
            return Strings.from_return_msg(repMsg)
        # With origins requested the reply has three '+'-separated parts: the
        # first two are rejoined to build the Strings, the third is the
        # origins pdarray.
        arrays = repMsg.split("+", maxsplit=2)
        return Strings.from_return_msg("+".join(arrays[0:2])), create_pdarray(arrays[2])

    def find_matches(self, return_match_origins: bool = False):
        """
        Return all matches as a new Strings object

        Parameters
        ----------
        return_match_origins: bool
            If True, return a pdarray containing the index of the original string each pattern
            match is from

        Returns
        -------
        Strings
            Strings object containing only matches
        pdarray, int64 (optional)
            The index of the original string each pattern match is from

        Raises
        ------
        RuntimeError
            Raised if there is a server-side error thrown

        Examples
        --------
        >>> strings = ak.array(['1_2___', '____', '3', '__4___5____6___7', ''])
        >>> strings.search('_+').find_matches(return_match_origins=True)
        (array(['_', '____', '__']), array([0 1 3]))
        """
        return self._find_all(
            self._matched,
            self._starts,
            self._lengths,
            self._indices,
            return_match_origins,
        )

    def group(self, group_num: int = 0, return_group_origins: bool = False):
        """
        Returns a new Strings containing the capture group corresponding to group_num.
        For the default, group_num=0, return the full match

        Parameters
        ----------
        group_num: int
            The index of the capture group to be returned
        return_group_origins: bool
            If True, return a pdarray containing the index of the original string each
            capture group is from

        Returns
        -------
        Strings
            Strings object containing only the capture groups corresponding to group_num
        pdarray, int64 (optional)
            The index of the original string each group is from

        Raises
        ------
        ValueError
            Raised if group_num is negative, if group_num is greater than
            regexMaxCaptures, or if the stored match type is not a MatchType

        Examples
        --------
        >>> strings = ak.array(["Isaac Newton, physics", '<-calculus->', 'Gottfried Leibniz, math'])
        >>> m = strings.search("(\\w+) (\\w+)")
        >>> m.group()
        array(['Isaac Newton', 'Gottfried Leibniz'])
        >>> m.group(1)
        array(['Isaac', 'Gottfried'])
        >>> m.group(2, return_group_origins=True)
        (array(['Newton', 'Leibniz']), array([0 2]))
        """
        from arkouda.client import regexMaxCaptures

        if group_num < 0:
            raise ValueError("group_num cannot be negative")
        if group_num > regexMaxCaptures:
            max_capture_flag = f"-e REGEX_MAX_CAPTURES={group_num}"
            e = (
                f"group_num={group_num} > regexMaxCaptures={regexMaxCaptures}."
                f" To run group({group_num}), recompile the server with flag '{max_capture_flag}'"
            )
            raise ValueError(e)

        # We don't cache the locations of groups, find the location info and call findAll
        repMsg = cast(
            str,
            generic_msg(
                cmd="segmentedFindLoc",
                args={
                    "objType": self._objtype,
                    "parent_name": self._parent_entry_name,
                    "groupNum": group_num,
                    "pattern": self.re,
                },
            ),
        )
        created_map = json.loads(repMsg)
        global_starts = create_pdarray(created_map["Starts"])
        global_lengths = create_pdarray(created_map["Lens"])
        global_indices = create_pdarray(created_map["Indices"])

        # The reply carries one Bool/Ind pair per match type; use the pair
        # that corresponds to how this Match was produced.
        if self._match_type == MatchType.SEARCH:
            matched = create_pdarray(created_map["SearchBool"])
            indices = create_pdarray(created_map["SearchInd"])
        elif self._match_type == MatchType.MATCH:
            matched = create_pdarray(created_map["MatchBool"])
            indices = create_pdarray(created_map["MatchInd"])
        elif self._match_type == MatchType.FULLMATCH:
            matched = create_pdarray(created_map["FullMatchBool"])
            indices = create_pdarray(created_map["FullMatchInd"])
        else:
            raise ValueError(f"{self._match_type} is not a MatchType")

        # Keep only the entries belonging to elements that matched.
        selected = global_indices[matched]
        starts = global_starts[selected]
        lengths = global_lengths[selected]
        return self._find_all(matched, starts, lengths, indices, return_group_origins)