""" Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ {{py: # name complex_types = ['complex64', 'complex128'] }} {{for name in complex_types}} cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: cdef kh{{name}}_t res res.real = val.real res.imag = val.imag return res {{endfor}} {{py: # name c_types = ['khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t', 'int64_t', 'int32_t', 'int16_t', 'int8_t', 'uint64_t', 'uint32_t', 'uint16_t', 'uint8_t'] }} {{for c_type in c_types}} cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }} return val.real != val.real or val.imag != val.imag {{elif c_type in {'float64_t', 'float32_t'} }} return val != val {{else}} return False {{endif}} {{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }} # are_equivalent_{{c_type}} is cimported via khash.pxd {{else}} cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil: return val1 == val2 {{endif}} {{endfor}} {{py: # name cimported_types = ['complex64', 'complex128', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'pymap', 'str', 'strbox', 'uint8', 'uint16', 'uint32', 'uint64'] }} {{for name in cimported_types}} from pandas._libs.khash cimport ( kh_destroy_{{name}}, kh_exist_{{name}}, kh_get_{{name}}, kh_init_{{name}}, kh_put_{{name}}, kh_resize_{{name}}, ) {{endfor}} # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA {{py: # name, dtype, c_type # the generated StringVector is not actually used # but is included for completeness (rather ObjectVector is used # for uniques in hashtables) dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), ('Complex64', 'complex64', 'khcomplex64_t'), ('Float64', 'float64', 'float64_t'), ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), ('Int16', 'int16', 'int16_t'), ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), ('UInt64', 'uint64', 'uint64_t'), ('UInt32', 'uint32', 'uint32_t'), ('UInt16', 'uint16', 'uint16_t'), ('UInt8', 'uint8', 'uint8_t')] }} {{for name, dtype, c_type in dtypes}} {{if dtype != 'int64'}} # Int64VectorData is defined in the .pxd file because it is needed (indirectly) # by IntervalTree ctypedef struct {{name}}VectorData: {{c_type}} *data Py_ssize_t n, m {{endif}} @cython.wraparound(False) @cython.boundscheck(False) cdef inline void append_data_{{dtype}}({{name}}VectorData *data, {{c_type}} x) nogil: data.data[data.n] = x data.n += 1 {{endfor}} ctypedef fused vector_data: Int64VectorData Int32VectorData Int16VectorData Int8VectorData UInt64VectorData UInt32VectorData UInt16VectorData UInt8VectorData Float64VectorData Float32VectorData Complex128VectorData Complex64VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: return data.n == data.m # ---------------------------------------------------------------------- # Vector # ---------------------------------------------------------------------- cdef class Vector: # cdef readonly: # bint external_view_exists def __cinit__(self): self.external_view_exists = False {{py: # name, dtype, c_type dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), ('Complex64', 'complex64', 'khcomplex64_t'), ('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), ('Float32', 'float32', 'float32_t'), ('UInt32', 'uint32', 'uint32_t'), ('Int32', 'int32', 'int32_t'), ('UInt16', 'uint16', 'uint16_t'), ('Int16', 'int16', 'int16_t'), ('UInt8', 'uint8', 'uint8_t'), ('Int8', 'int8', 'int8_t')] }} {{for name, dtype, c_type in dtypes}} cdef class {{name}}Vector(Vector): # For int64 we have to put this declaration in the .pxd file; # Int64Vector is the only one we need exposed for other cython files. {{if dtype != 'int64'}} cdef: {{name}}VectorData *data ndarray ao {{endif}} def __cinit__(self): self.data = <{{name}}VectorData *>PyMem_Malloc( sizeof({{name}}VectorData)) if not self.data: raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data def __dealloc__(self): if self.data is not NULL: PyMem_Free(self.data) self.data = NULL def __len__(self) -> int: return self.data.n cpdef ndarray to_array(self): if self.data.m != self.data.n: if self.external_view_exists: # should never happen raise ValueError("should have raised on append()") self.ao.resize(self.data.n, refcheck=False) self.data.m = self.data.n self.external_view_exists = True return self.ao cdef inline void append(self, {{c_type}} x): if needs_resize(self.data): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.resize() append_data_{{dtype}}(self.data, x) cdef extend(self, const {{c_type}}[:] x): for i in range(len(x)): self.append(x[i]) {{endfor}} cdef class StringVector(Vector): cdef: StringVectorData *data def __cinit__(self): self.data = PyMem_Malloc(sizeof(StringVectorData)) if not self.data: raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() cdef resize(self): cdef: char **orig_data Py_ssize_t i, m m = self.data.m self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) orig_data = self.data.data self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() for i in range(m): self.data.data[i] = orig_data[i] def __dealloc__(self): if self.data is not NULL: if self.data.data is not NULL: free(self.data.data) PyMem_Free(self.data) self.data = NULL def __len__(self) -> int: return self.data.n cpdef ndarray[object, ndim=1] to_array(self): cdef: ndarray ao Py_ssize_t n object val ao = np.empty(self.data.n, dtype=object) for i in range(self.data.n): val = self.data.data[i] ao[i] = val self.external_view_exists = True self.data.m = self.data.n return ao cdef inline void append(self, char *x): if needs_resize(self.data): self.resize() append_data_string(self.data, x) cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) cdef class ObjectVector(Vector): cdef: PyObject **data Py_ssize_t n, m ndarray ao def __cinit__(self): self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) self.data = self.ao.data def __len__(self) -> int: return self.n cdef inline append(self, object obj): if self.n == self.m: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) self.ao.resize(self.m, refcheck=False) self.data = self.ao.data Py_INCREF(obj) self.data[self.n] = obj self.n += 1 cpdef ndarray[object, ndim=1] to_array(self): if self.m != self.n: if self.external_view_exists: raise ValueError("should have raised on append()") self.ao.resize(self.n, refcheck=False) self.m = self.n self.external_view_exists = True return self.ao cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) # ---------------------------------------------------------------------- # HashTable # ---------------------------------------------------------------------- cdef class HashTable: pass {{py: # name, dtype, c_type, to_c_type dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), ('Float64', 'float64', 'float64_t', ''), ('UInt64', 'uint64', 'uint64_t', ''), ('Int64', 'int64', 'int64_t', ''), ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), ('Float32', 'float32', 'float32_t', ''), ('UInt32', 'uint32', 'uint32_t', ''), ('Int32', 'int32', 'int32_t', ''), ('UInt16', 'uint16', 'uint16_t', ''), ('Int16', 'int16', 'int16_t', ''), ('UInt8', 'uint8', 'uint8_t', ''), ('Int8', 'int8', 'int8_t', '')] }} {{for name, dtype, c_type, to_c_type in dtypes}} cdef class {{name}}HashTable(HashTable): def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) def __len__(self) -> int: return self.table.size def __dealloc__(self): if self.table is not NULL: kh_destroy_{{dtype}}(self.table) self.table = NULL def __contains__(self, object key) -> bool: cdef: khiter_t k {{c_type}} ckey ckey = {{to_c_type}}(key) k = kh_get_{{dtype}}(self.table, ckey) return k != self.table.n_buckets def sizeof(self, deep: bool = False) -> int: """ return the size of my table in bytes """ overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs def get_state(self) -> dict[str, int]: """ returns infos about the state of the hashtable""" return { 'n_buckets' : self.table.n_buckets, 'size' : self.table.size, 'n_occupied' : self.table.n_occupied, 'upper_bound' : self.table.upper_bound, } cpdef get_item(self, {{dtype}}_t val): # Used in core.sorting, IndexEngine.get_loc cdef: khiter_t k {{c_type}} cval cval = {{to_c_type}}(val) k = kh_get_{{dtype}}(self.table, cval) if k != self.table.n_buckets: return self.table.vals[k] else: raise KeyError(val) cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): # Used in libjoin cdef: khiter_t k int ret = 0 {{c_type}} ckey ckey = {{to_c_type}}(key) k = kh_put_{{dtype}}(self.table, ckey, &ret) if kh_exist_{{dtype}}(self.table, k): self.table.vals[k] = val else: raise KeyError(key) {{if dtype == "int64" }} # We only use this for int64, can reduce build size and make .pyi # more accurate by only implementing it for int64 @cython.boundscheck(False) def map_keys_to_values( self, const {{dtype}}_t[:] keys, const int64_t[:] values ) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 {{c_type}} key khiter_t k with nogil: for i in range(n): key = {{to_c_type}}(keys[i]) k = kh_put_{{dtype}}(self.table, key, &ret) self.table.vals[k] = values[i] {{endif}} @cython.boundscheck(False) def map_locations(self, const {{dtype}}_t[:] values) -> None: # Used in libindex, safe_sort cdef: Py_ssize_t i, n = len(values) int ret = 0 {{c_type}} val khiter_t k with nogil: for i in range(n): val= {{to_c_type}}(values[i]) k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = i @cython.boundscheck(False) def lookup(self, const {{dtype}}_t[:] values) -> ndarray: # -> np.ndarray[np.intp] # Used in safe_sort, IndexEngine.get_indexer cdef: Py_ssize_t i, n = len(values) int ret = 0 {{c_type}} val khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) with nogil: for i in range(n): val = {{to_c_type}}(values[i]) k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: locs[i] = self.table.vals[k] else: locs[i] = -1 return np.asarray(locs) @cython.boundscheck(False) @cython.wraparound(False) def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, object mask=None, bint return_inverse=False, bint use_result_mask=False): """ Calculate unique values and labels (no sorting!) Parameters ---------- values : ndarray[{{dtype}}] Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. ignore_na : bool, default False Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. mask : ndarray[bool], optional If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. use_result_mask: bool, default False Whether to create a result mask for the unique values. Not supported with return_inverse=True. Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques result_mask: ndarray[bool], if use_result_mask is true The mask for the result values. """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) intp_t[::1] labels int ret = 0 {{c_type}} val, na_value2 khiter_t k {{name}}VectorData *ud UInt8Vector result_mask UInt8VectorData *rmd bint use_na_value, use_mask, seen_na = False uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.intp) ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: raise NotImplementedError # pragma: no cover if use_result_mask and return_inverse: raise NotImplementedError # pragma: no cover result_mask = UInt8Vector() rmd = result_mask.data if use_mask: mask_values = mask.view("uint8") if use_na_value: # We need this na_value2 because we want to allow users # to *optionally* specify an NA sentinel *of the correct* type. # We use None, to make it optional, which requires `object` type # for the parameter. To please the compiler, we use na_value2, # which is only used if it's *specified*. na_value2 = {{to_c_type}}(na_value) else: na_value2 = {{to_c_type}}(0) with nogil: for i in range(n): val = {{to_c_type}}(values[i]) if ignore_na and use_mask: if mask_values[i]: labels[i] = na_sentinel continue elif ignore_na and ( is_nan_{{c_type}}(val) or (use_na_value and are_equivalent_{{c_type}}(val, na_value2)) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue elif not ignore_na and use_result_mask: if mask_values[i]: if seen_na: continue seen_na = True if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") uniques.resize() if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") result_mask.resize() append_data_{{dtype}}(ud, val) append_data_uint8(rmd, 1) continue k = kh_get_{{dtype}}(self.table, val) if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") uniques.resize() if use_result_mask: if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") result_mask.resize() append_data_{{dtype}}(ud, val) if use_result_mask: append_data_uint8(rmd, 0) if return_inverse: self.table.vals[k] = count labels[i] = count count += 1 elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray if use_result_mask: return uniques.to_array(), result_mask.to_array() return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) Parameters ---------- values : ndarray[{{dtype}}] Array of values of which unique will be calculated return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. mask : ndarray[bool], optional If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques result_mask: ndarray[bool], if mask is given as input The mask for the result values. """ uniques = {{name}}Vector() use_result_mask = True if mask is not None else False return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) Missing values are not included in the "uniques" for this method. The labels for any missing values will be set to "na_sentinel" Parameters ---------- values : ndarray[{{dtype}}] Array of values of which unique will be calculated na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. mask : ndarray[bool], optional If not None, the mask is used as indicator for missing values (True = missing, False = valid) instead of `na_value` or condition "val != val". Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = {{name}}Vector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, na_value=na_value, ignore_na=ignore_na, mask=mask, return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) return labels {{if dtype == 'int64'}} @cython.boundscheck(False) def get_labels_groupby( self, const {{dtype}}_t[:] values ) -> tuple[ndarray, ndarray]: # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]] cdef: Py_ssize_t i, n = len(values) intp_t[::1] labels Py_ssize_t idx, count = 0 int ret = 0 {{c_type}} val khiter_t k {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud labels = np.empty(n, dtype=np.intp) ud = uniques.data with nogil: for i in range(n): val = {{to_c_type}}(values[i]) # specific for groupby if val < 0: labels[i] = -1 continue k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: idx = self.table.vals[k] labels[i] = idx else: k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count if needs_resize(ud): with gil: uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count count += 1 arr_uniques = uniques.to_array() return np.asarray(labels), arr_uniques {{endif}} {{endfor}} cdef class StringHashTable(HashTable): # these by-definition *must* be strings # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' def __init__(self, int64_t size_hint=1): self.table = kh_init_str() size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) kh_resize_str(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: kh_destroy_str(self.table) self.table = NULL def sizeof(self, deep: bool = False) -> int: overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) for_pairs = self.table.n_buckets * (sizeof(char *) + # keys sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs def get_state(self) -> dict[str, int]: """ returns infos about the state of the hashtable""" return { 'n_buckets' : self.table.n_buckets, 'size' : self.table.size, 'n_occupied' : self.table.n_occupied, 'upper_bound' : self.table.upper_bound, } cpdef get_item(self, str val): cdef: khiter_t k const char *v v = get_c_string(val) k = kh_get_str(self.table, v) if k != self.table.n_buckets: return self.table.vals[k] else: raise KeyError(val) cpdef set_item(self, str key, Py_ssize_t val): cdef: khiter_t k int ret = 0 const char *v v = get_c_string(key) k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): self.table.vals[k] = val else: raise KeyError(key) @cython.boundscheck(False) def get_indexer(self, ndarray[object] values) -> ndarray: # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) ndarray[intp_t] labels = np.empty(n, dtype=np.intp) intp_t *resbuf = labels.data khiter_t k kh_str_t *table = self.table const char *v const char **vecs vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] v = get_c_string(val) vecs[i] = v with nogil: for i in range(n): k = kh_get_str(table, vecs[i]) if k != table.n_buckets: resbuf[i] = table.vals[k] else: resbuf[i] = -1 free(vecs) return labels @cython.boundscheck(False) def lookup(self, ndarray[object] values) -> ndarray: # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 object val const char *v khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] if isinstance(val, str): # GH#31499 if we have a np.str_ get_c_string won't recognize # it as a str, even though isinstance does. v = get_c_string(val) else: v = get_c_string(self.na_string_sentinel) vecs[i] = v with nogil: for i in range(n): v = vecs[i] k = kh_get_str(self.table, v) if k != self.table.n_buckets: locs[i] = self.table.vals[k] else: locs[i] = -1 free(vecs) return np.asarray(locs) @cython.boundscheck(False) def map_locations(self, ndarray[object] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 object val const char *v const char **vecs khiter_t k # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] if isinstance(val, str): # GH#31499 if we have a np.str_ get_c_string won't recognize # it as a str, even though isinstance does. v = get_c_string(val) else: v = get_c_string(self.na_string_sentinel) vecs[i] = v with nogil: for i in range(n): v = vecs[i] k = kh_put_str(self.table, v, &ret) self.table.vals[k] = i free(vecs) @cython.boundscheck(False) @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) Parameters ---------- values : ndarray[object] Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then any value that is not a string is considered missing. If na_value is not None, then _additionally_ any value "val" satisfying val == na_value is considered missing. ignore_na : bool, default False Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) intp_t[::1] labels int64_t[::1] uindexer int ret = 0 object val const char *v const char **vecs khiter_t k bint use_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] if (ignore_na and (not isinstance(val, str) or (use_na_value and val == na_value))): # if missing values do not count as unique values (i.e. if # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. try: v = get_c_string(val) except UnicodeEncodeError: v = get_c_string(repr(val)) vecs[i] = v # compute with nogil: for i in range(n): if ignore_na and labels[i] == na_sentinel: # skip entries for ignored missing values (see above) continue v = vecs[i] k = kh_get_str(self.table, v) if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i if return_inverse: self.table.vals[k] = count labels[i] = count count += 1 elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx free(vecs) # uniques for i in range(count): uniques.append(values[uindexer[i]]) if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) Parameters ---------- values : ndarray[object] Array of values of which unique will be calculated return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. mask : ndarray[bool], optional Not yet implemented for StringHashTable Returns ------- uniques : ndarray[object] Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) Missing values are not included in the "uniques" for this method. The labels for any missing values will be set to "na_sentinel" Parameters ---------- values : ndarray[object] Array of values of which unique will be calculated na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then any value that is not a string is considered missing. If na_value is not None, then _additionally_ any value "val" satisfying val == na_value is considered missing. mask : ndarray[bool], optional Not yet implemented for StringHashTable. Returns ------- uniques : ndarray[object] Unique values of input, not sorted labels : ndarray[intp] The labels from values to uniques """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) return labels cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) kh_resize_pymap(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: kh_destroy_pymap(self.table) self.table = NULL def __len__(self) -> int: return self.table.size def __contains__(self, object key) -> bool: cdef: khiter_t k hash(key) k = kh_get_pymap(self.table, key) return k != self.table.n_buckets def sizeof(self, deep: bool = False) -> int: """ return the size of my table in bytes """ overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs def get_state(self) -> dict[str, int]: """ returns infos about the current state of the hashtable like size, number of buckets and so on. """ return { 'n_buckets' : self.table.n_buckets, 'size' : self.table.size, 'n_occupied' : self.table.n_occupied, 'upper_bound' : self.table.upper_bound, } cpdef get_item(self, object val): cdef: khiter_t k k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: return self.table.vals[k] else: raise KeyError(val) cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k int ret = 0 char* buf hash(key) k = kh_put_pymap(self.table, key, &ret) if kh_exist_pymap(self.table, k): self.table.vals[k] = val else: raise KeyError(key) def map_locations(self, ndarray[object] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 object val khiter_t k for i in range(n): val = values[i] hash(val) k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i def lookup(self, ndarray[object] values) -> ndarray: # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 object val khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) for i in range(n): val = values[i] hash(val) k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: locs[i] = self.table.vals[k] else: locs[i] = -1 return np.asarray(locs) @cython.boundscheck(False) @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) Parameters ---------- values : ndarray[object] Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then None _plus_ any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. ignore_na : bool, default False Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) intp_t[::1] labels int ret = 0 object val khiter_t k bint use_na_value if return_inverse: labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) if ignore_na and ( checknull(val) or (use_na_value and val == na_value) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, and # replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) if return_inverse: self.table.vals[k] = count labels[i] = count count += 1 elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) Parameters ---------- values : ndarray[object] Array of values of which unique will be calculated return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. mask : ndarray[bool], optional Not yet implemented for PyObjectHashTable Returns ------- uniques : ndarray[object] Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) Missing values are not included in the "uniques" for this method. The labels for any missing values will be set to "na_sentinel" Parameters ---------- values : ndarray[object] Array of values of which unique will be calculated na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then None _plus_ any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. mask : ndarray[bool], optional Not yet implemented for PyObjectHashTable. Returns ------- uniques : ndarray[object] Unique values of input, not sorted labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = ObjectVector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, na_value=na_value, ignore_na=ignore_na, return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) return labels