SegmentedString

Usage

use SegmentedString;

or

import SegmentedString;
const ssLogger = new Logger(logLevel, logChannel)
param SegmentedStringUseHash = useHash
enum Fixes { prefixes, suffixes }
enum constant prefixes
enum constant suffixes
config const NULL_STRINGS_VALUE = 0 : uint(8)
proc getSegString(name: string, st: borrowed SymTab) : owned SegString throws
proc getSegString(segments: [] int, values: [] uint(8), st: borrowed SymTab) : owned SegString throws
  • This version of the getSegString method takes segments and values arrays as

  • inputs, generates the SymEntry objects for each and passes the

  • offset and value SymTab lookup names to the alternate init method

proc assembleSegStringFromParts(offsets: GenSymEntry, values: GenSymEntry, st: borrowed SymTab) : owned SegString throws
proc assembleSegStringFromParts(offsets: SymEntry(int), values: SymEntry(uint(8)), st: borrowed SymTab) : owned SegString throws
class SegString
  • Represents an array of strings, implemented as a segmented array of bytes.

  • Instances are ephemeral, not stored in the symbol table. Instead, attributes

  • of this class refer to symbol table entries that persist. This class is a

  • convenience for bundling those persistent objects and defining string-relevant

  • operations.

var name : string
var composite : borrowed SegStringSymEntry
var offsets : shared SymEntry(int, 1)
  • The pdarray containing the offsets, which are the start indices of

  • the bytearrays, each of which corresponds to an individual string.

var values : shared SymEntry(uint(8), 1)
  • The pdarray containing the complete byte array composed of bytes

  • corresponding to each string, joined by nulls. Note: the null byte

  • is uint(8) value of zero.

var size : int
  • The number of strings in the segmented array

var nBytes : int
  • The total number of bytes in the entire segmented array including

  • the bytes corresonding to the strings as well as the nulls

  • separating the string bytes.

proc init(entryName: string, entry: borrowed SegStringSymEntry)
  • This method should not be called directly. Instead, call one of the

  • getSegString factory methods.

proc show(n: int = 3) throws
proc this(idx: ?t) : string throws  where t == int || t == uint

Retrieve one string from the array

proc this(const slice: range()) throws

Take a slice of strings from the array. The slice must be a Chapel range, i.e. low..high by stride, not a Python slice. Returns arrays for the segment offsets and bytes of the slice.

proc this(const slice: range(strides = strideKind.any)) throws
proc this(iv: [?D] ?t) throws  where t == int || t == uint

Gather strings by index. Returns arrays for the segment offsets and bytes of the gathered strings.

proc this(iv: [?D] bool) throws

Logical indexing (compress) of strings.

proc siphash() throws

Apply a hash function to all strings. This is useful for grouping and set membership. The hash used is SipHash128.

proc argGroup() throws

Return a permutation that groups the strings. Because hashing is used, this permutation will not sort the strings, but all equivalent strings will fall in one contiguous block.

proc getLengths() throws

Return lengths of all strings, including null terminator.

proc lower() throws

Given a SegString, return a new SegString with all uppercase characters from the original replaced with their lowercase equivalent.

Returns:

Strings – Substrings with uppercase characters replaced with lowercase equivalent

proc upper() throws

Given a SegString, return a new SegString with all lowercase characters from the original replaced with their uppercase equivalent.

Returns:

Strings – Substrings with lowercase characters replaced with uppercase equivalent

proc title() throws

Given a SegString, return a new SegString with first character of each original element replaced with its uppercase equivalent and the remaining characters replaced with their lowercase equivalent. The first character following a space character will be uppercase.

Returns:

Strings – Substrings with first characters replaced with uppercase equivalent and remaining characters replaced with their lowercase equivalent. The first character following a space character will be uppercase.

proc isDecimal() throws

Returns list of bools where index i indicates whether the string i of the SegString is a decimal.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is a decimal

proc capitalize() throws

Given a SegString, return a new SegString with first character of each original element replaced with its uppercase equivalent and the remaining characters replaced with their lowercase equivalent.

Returns:

Strings – Substrings with first characters replaced with uppercase equivalent and remaining characters replaced with their lowercase equivalent

proc isLower() throws

Returns list of bools where index i indicates whether the string i of the SegString is entirely lowercase.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is entirely lowercase

proc isUpper() throws

Returns list of bools where index i indicates whether the string i of the SegString is entirely uppercase.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is entirely uppercase

proc isTitle() throws

Returns list of bools where index i indicates whether the string i of the SegString is titlecase.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is titlecase

proc isalnum() throws

Returns list of bools where index i indicates whether the string i of the SegString is alphanumeric.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is alphanumeric

proc isalpha() throws

Returns list of bools where index i indicates whether the string i of the SegString is alphabetic.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is alphabetic

proc isdigit() throws

Returns list of bools where index i indicates whether the string i of the SegString is digits.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is digits

proc isempty() throws

Returns list of bools where index i indicates whether the string i of the SegString is empty.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is empty

proc isspace() throws

Returns list of bools where index i indicates whether the string i of the SegString is whitespace.

Returns:

[domain] bool where index i indicates whether the string i of the SegString is whitespace

proc bytesToUintArr(const max_bytes: int, lens: [?D] ?t, st) throws
proc findSubstringInBytes(const substr: string) throws
proc findMatchLocations(const pattern: string, groupNum: int) throws

Given a SegString, finds pattern matches and returns pdarrays containing the number, start positions, and lengths of matches.

Arguments:
  • pattern : string – The regex pattern used to find matches

  • groupNum : int – The number of the capture group to be returned

Returns:

int64 pdarray – For each original string, the number of pattern matches and int64 pdarray – The start positions of pattern matches and int64 pdarray – The lengths of pattern matches

proc findAllMatches(const numMatchesEntry: ?t, const startsEntry: borrowed SymEntry(int, 1), const lensEntry: borrowed SymEntry(int, 1), const indicesEntry: borrowed SymEntry(int, 1), const returnMatchOrig: bool) throws  where t == borrowed SymEntry(int, 1) || t == borrowed SymEntry(bool, 1)

Given a SegString, return a new SegString only containing matches of the regex pattern. If returnMatchOrig is set to True, return a pdarray containing the index of the original string each pattern match is from.

Arguments:
  • numMatchesEntry : borrowed SymEntry(int) or borrowed SymEntry(bool) – For each string in SegString, the number of pattern matches

  • startsEntry : borrowed SymEntry(int) – The starting positions of pattern matches

  • lensEntry : borrowed SymEntry(int) – The lengths of pattern matches

  • returnMatchOrig : bool – If True, return a pdarray containing the index of the original string each pattern match is from

Returns:

Strings – Only the portions of Strings which match pattern and (optional) int64 pdarray – For each pattern match, the index of the original string it was in

proc sub(pattern: string, replStr: string, initCount: int, returnNumSubs: bool) throws

Substitute pattern matches with replStr. If initCount is nonzero, at most initCount substitutions occur. If returnNumSubs is set to True, the number of substitutions per string will be returned.

Arguments:
  • pattern : string – regex pattern used to find matches

  • replStr : string – the string to replace pattern matches with

  • initCount : int – If nonzero, at most initCount substitutions occur. If zero, substitute all occurrences of pattern

  • returnNumSubs : bool – If True, also return the number of substitutions per string

Returns:

Strings – Substrings with pattern matches substituted and (optional) int64 pdarray – For each original string, the number of substitutions

proc segStrWhere(otherStr: ?t, condition: [] bool, ref newLens: [] int) throws  where t == string
proc segStrWhere(other: ?t, condition: [] bool, ref newLens: [] int) throws  where t == owned SegString
proc strip(chars: string) throws

Strip out all of the leading and trailing characters of each element of a segstring that are called out in the “chars” argument.

Arguments:

chars : string – the set of characters to be removed

Returns:

Strings – substrings with stripped characters from the original string and the offsets into those substrings

proc substringSearch(const pattern: string) throws

Returns list of bools where index i indicates whether the regular expression, pattern, matched string i of the SegString

Note: the regular expression engine used, re2, does not support lookahead/lookbehind

Arguments:

pattern : string – regex pattern to be applied to strings in SegString

Returns:

[domain] bool where index i indicates whether the regular expression, pattern, matched string i of the SegString

proc peelRegex(const delimiter: string, const times: int, const includeDelimiter: bool, const keepPartial: bool, const left: bool) throws

Peel off one or more fields matching the regular expression, delimiter, from each string (similar to string.partition), returning two new arrays of strings. Warning: This function is experimental and not guaranteed to work.

Note: the regular expression engine used, re2, does not support lookahead/lookbehind

Arguments:
  • delimiter : string – regex delimiter where the split in SegString will occur

  • times : int – The number of times the delimiter is sought, i.e. skip over the first (times-1) delimiters

  • includeDelimiter : bool – If true, append the delimiter to the end of the first return array. By default, it is prepended to the beginning of the second return array.

  • keepPartial : bool – If true, a string that does not contain <times> instances of the delimiter will be returned in the first array. By default, such strings are returned in the second array.

  • left : bool – If true, peel from the left

Returns:

Components to build 2 SegStrings (leftOffsets, leftVals, rightOffsets, rightVals)

proc peel(const delimiter: string, const times: int, param includeDelimiter: bool, param keepPartial: bool, param left: bool) throws
proc stick(other: SegString, delim: string, param right: bool) throws
proc ediff() : [offsets.a.domain] int throws
proc isSorted() : bool throws
proc argsort(checkSorted: bool = false) : [offsets.a.domain] int throws
proc getFixes(n: int, kind: Fixes, proper: bool) throws
proc memcmp(const ref x: [] uint(8), const xinds, const ref y: [] uint(8), const yinds) : int
operator ==(lss: SegString, rss: SegString) throws

Test for equality between two same-length arrays of strings. Returns a boolean vector of the same length.

operator !=(lss: SegString, rss: SegString) throws

Test for inequality between two same-length arrays of strings. Returns a boolean vector of the same length.

operator ==(ss: SegString, testStr: string) throws

Test an array of strings for equality against a constant string. Return a boolean vector the same size as the array.

operator !=(ss: SegString, testStr: string) throws

Test an array of strings for inequality against a constant string. Return a boolean vector the same size as the array.

proc stringCompareLiteralEq(ref values, rng, testStr)
proc stringCompareLiteralNeq(ref values, rng, testStr)
proc compare(ss: SegString, const testStr: string, param function: SegFunction) throws

Element-wise comparison of an array of strings against a target string. The function parameter (a SegFunction; presumably stringCompareLiteralEq or stringCompareLiteralNeq — confirm against the implementation) determines whether the comparison checks for equality (result is true where elements equal target) or inequality (result is true where elements differ from target).

proc checkCompile(const pattern: ?t) throws  where t == bytes || t == string

Returns Regexp.compile if pattern can be compiled without an error

proc unsafeCompileRegex(const pattern: ?t)  where t == bytes || t == string
proc stringSearch(ref values, rng, myRegex) throws
proc stringIsLower(ref values, rng) throws

The SegFunction called by computeOnSegments for isLower

proc stringIsUpper(ref values, rng) throws

The SegFunction called by computeOnSegments for isUpper

proc stringIsTitle(ref values, rng) throws

The SegFunction called by computeOnSegments for isTitle

proc stringIsAlphaNumeric(ref values, rng) throws

The SegFunction called by computeOnSegments for isalnum

proc stringIsAlphabetic(ref values, rng) throws

The SegFunction called by computeOnSegments for isalpha

proc stringIsDecimal(ref values, rng) throws

The SegFunction called by computeOnSegments for isdecimal, using isDigit

proc stringIsDigit(ref values, rng) throws

The SegFunction called by computeOnSegments for isdigit

proc stringIsEmpty(ref values, rng) throws

The SegFunction called by computeOnSegments for isempty

proc stringIsSpace(ref values, rng) throws

The SegFunction called by computeOnSegments for isspace

proc stringBytesToUintArr(ref values, rng) throws
proc in1d(mainStr: SegString, testStr: SegString, invert = false) throws  where useHash

Test array of strings for membership in another array (set) of strings. Returns a boolean vector the same size as the first array.

proc concat(s1: [] int, v1: [] uint(8), s2: [] int, v2: [] uint(8)) throws
proc in1d(mainStr: SegString, testStr: SegString, invert = false) throws  where !useHash
proc segStrFull(arrSize: int, fillValue: string) throws
proc interpretAsString(ref bytearray: [?D] uint(8), region: range(?), borrow = false) : string

Interpret a region of a byte array as a Chapel string. If borrow=false a new string is returned, otherwise the string borrows memory from the array (reduces memory allocations if the string isn’t needed after array)

proc interpretAsBytes(ref bytearray: [?D] uint(8), region: range(?), borrow = false) : bytes

Interpret a region of a byte array as bytes. Modeled after interpretAsString