cimport cython
from cpython.mem cimport (
    PyMem_Free,
    PyMem_Malloc,
)
from cpython.ref cimport (
    Py_INCREF,
    PyObject,
)
from libc.stdlib cimport (
    free,
    malloc,
)

import numpy as np

cimport numpy as cnp
from numpy cimport (
    float64_t,
    ndarray,
    uint8_t,
    uint32_t,
)
from numpy.math cimport NAN

cnp.import_array()


from pandas._libs cimport util
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.khash cimport (
    KHASH_TRACE_DOMAIN,
    are_equivalent_float32_t,
    are_equivalent_float64_t,
    are_equivalent_khcomplex64_t,
    are_equivalent_khcomplex128_t,
    kh_needed_n_buckets,
    kh_python_hash_equal,
    kh_python_hash_func,
    kh_str_t,
    khcomplex64_t,
    khcomplex128_t,
    khiter_t,
)
from pandas._libs.missing cimport checknull


def get_hashtable_trace_domain():
    """Return the ``KHASH_TRACE_DOMAIN`` constant exposed by khash."""
    return KHASH_TRACE_DOMAIN


def object_hash(obj):
    """Return ``kh_python_hash_func(obj)``, the khash hash of ``obj``."""
    return kh_python_hash_func(obj)


def objects_are_equal(a, b):
    """Return ``kh_python_hash_equal(a, b)``, khash's equality test."""
    return kh_python_hash_equal(a, b)


cdef int64_t NPY_NAT = util.get_nat()
SIZE_HINT_LIMIT = (1 << 20) + 7

cdef Py_ssize_t _INIT_VEC_CAP = 128

include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"


# map derived hash-map types onto basic hash-map types:
# ``intp`` is platform dependent, so alias it to whichever fixed-width
# implementation has the same dtype on this build.
if np.dtype(np.intp) == np.dtype(np.int64):
    IntpHashTable = Int64HashTable
    unique_label_indices = _unique_label_indices_int64
elif np.dtype(np.intp) == np.dtype(np.int32):
    IntpHashTable = Int32HashTable
    unique_label_indices = _unique_label_indices_int32
else:
    raise ValueError(np.dtype(np.intp))


cdef ndarray _relabel_sorted(ndarray uniques, ndarray labels, na_sentinel):
    """
    Remap ``labels`` so that they refer to ``uniques`` in sorted order.

    Parameters
    ----------
    uniques : ndarray
        The unique values collected so far, in order of first appearance.
    labels : ndarray
        Positions into ``uniques``; entries equal to ``na_sentinel`` mark
        values that received no label.
    na_sentinel : int
        Marker value that must survive the remapping untouched.

    Returns
    -------
    ndarray
        ``labels`` itself when ``uniques`` is empty, otherwise a new array in
        which each label is the rank of its unique value and sentinel
        positions still hold ``na_sentinel``.
    """
    if len(uniques) == 0:
        # No unique was recorded, so there is nothing to reorder; taking
        # non-empty indices from an empty array would raise IndexError.
        return labels

    mask = labels == na_sentinel
    sorter = uniques.argsort()
    # reverse_indexer[old_position] == position after sorting
    reverse_indexer = np.empty(len(sorter), dtype=np.intp)
    reverse_indexer.put(sorter, np.arange(len(sorter)))
    # mode="clip" stops the sentinel from wrapping around (negative values)
    # or raising (out-of-range values); those slots are restored just below.
    remapped = reverse_indexer.take(labels, mode="clip")
    remapped[mask] = na_sentinel
    return remapped


cdef class Factorizer:
    """Base class holding the running count of uniques seen so far."""

    cdef readonly:
        Py_ssize_t count

    def __cinit__(self, size_hint: int):
        self.count = 0

    def get_count(self) -> int:
        """Return the number of uniques recorded by the last ``factorize``."""
        return self.count


cdef class ObjectFactorizer(Factorizer):
    """Factorizer for object-dtype values backed by a ``PyObjectHashTable``."""

    cdef public:
        PyObjectHashTable table
        ObjectVector uniques

    def __cinit__(self, size_hint: int):
        self.table = PyObjectHashTable(size_hint)
        self.uniques = ObjectVector()

    def factorize(
        self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
    ) -> np.ndarray:
        """
        Label ``values`` against the uniques accumulated by this factorizer.

        Parameters
        ----------
        values : ndarray[object]
            Values to label.
        sort : bool, default False
            If True, labels are remapped to follow the sorted order of the
            uniques instead of their order of first appearance.
        na_sentinel : int, default -1
            Forwarded to the hash table's ``get_labels``; positions holding
            this value are left untouched when sorting.
        na_value : object, default None
            Forwarded unchanged to the hash table's ``get_labels``.

        Returns
        -------
        np.ndarray[np.intp]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = ObjectFactorizer(3)
        >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
        array([ 0,  1, 20])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            # The vector reports that a view of its data exists outside it;
            # continue in a fresh vector seeded with the current contents.
            # NOTE(review): the flag is maintained by the vector class in the
            # included .pxi -- confirm there why appending would be unsafe.
            uniques = ObjectVector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel, na_value)
        # sort on
        if sort:
            labels = _relabel_sorted(
                self.uniques.to_array(), labels, na_sentinel
            )
        self.count = len(self.uniques)
        return labels


cdef class Int64Factorizer(Factorizer):
    """Factorizer for int64 values backed by an ``Int64HashTable``."""

    cdef public:
        Int64HashTable table
        Int64Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = Int64HashTable(size_hint)
        self.uniques = Int64Vector()

    def factorize(self, const int64_t[:] values, sort=False,
                  na_sentinel=-1, na_value=None) -> np.ndarray:
        """
        Label ``values`` against the uniques accumulated by this factorizer.

        Parameters
        ----------
        values : const int64_t[:]
            Values to label.
        sort : bool, default False
            If True, labels are remapped to follow the sorted order of the
            uniques instead of their order of first appearance.
        na_sentinel : int, default -1
            Forwarded to the hash table's ``get_labels``; positions holding
            this value are left untouched when sorting.
        na_value : object, default None
            Forwarded unchanged to the hash table's ``get_labels``.

        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values; positions the hash table marks as missing would
        hold na_sentinel

        >>> fac = Int64Factorizer(3)
        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            # The vector reports that a view of its data exists outside it;
            # continue in a fresh vector seeded with the current contents.
            # NOTE(review): the flag is maintained by the vector class in the
            # included .pxi -- confirm there why appending would be unsafe.
            uniques = Int64Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value)

        # sort on
        if sort:
            # Same sentinel-preserving remap as ObjectFactorizer; a plain
            # ``take`` would wrap a negative sentinel onto the last unique or
            # raise for an out-of-range one.
            labels = _relabel_sorted(
                self.uniques.to_array(), labels, na_sentinel
            )

        self.count = len(self.uniques)
        return labels