"""
data hash pandas / numpy objects
"""
from __future__ import annotations

import itertools
from typing import TYPE_CHECKING

import numpy as np

from pandas._libs.hashing import hash_object_array

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
    ABCIndex,
    ABCMultiIndex,
    ABCSeries,
)

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterable,
        Iterator,
    )

    from pandas._typing import (
        ArrayLike,
        npt,
    )

    from pandas import (
        DataFrame,
        Index,
        MultiIndex,
        Series,
    )

# 16 byte long hashing key
_default_hash_key = "0123456789123456"


def combine_hash_arrays(
arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
"""
Parameters
----------
arrays : Iterator[np.ndarray]
num_items : int
Returns
-------
np.ndarray[uint64]
Should be the same as CPython's tupleobject.c
"""
try:
first = next(arrays)
except StopIteration:
return np.array([], dtype=np.uint64)
arrays = itertools.chain([first], arrays)
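    # Elementwise analogue of CPython's tuple hash: xor in each item's
    # hash, multiply by an evolving multiplier, and add a final constant
    # at the end.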
mult = np.uint64(1000003)
out = np.zeros_like(first) + np.uint64(0x345678)
last_i = 0
for i, a in enumerate(arrays):
inverse_i = num_items - i
out ^= a
out *= mult
mult += np.uint64(82520 + inverse_i + inverse_i)
last_i = i
assert last_i + 1 == num_items, "Fed in wrong num_items"
out += np.uint64(97531)
return out


def hash_pandas_object(
obj: Index | DataFrame | Series,
index: bool = True,
encoding: str = "utf8",
hash_key: str | None = _default_hash_key,
categorize: bool = True,
) -> Series:
"""
Return a data hash of the Index/Series/DataFrame.
Parameters
----------
obj : Index, Series, or DataFrame
index : bool, default True
Include the index in the hash (if Series/DataFrame).
encoding : str, default 'utf8'
Encoding for data & key when strings.
hash_key : str, default _default_hash_key
Hash_key for string key to encode.
categorize : bool, default True
Whether to first categorize object arrays before hashing. This is more
efficient when the array contains duplicate values.
Returns
-------
Series of uint64, same length as the object
Examples
--------
>>> pd.util.hash_pandas_object(pd.Series([1, 2, 3]))
0 14639053686158035780
1 3869563279212530728
2 393322362522515241
dtype: uint64
"""
from pandas import Series
if hash_key is None:
hash_key = _default_hash_key
if isinstance(obj, ABCMultiIndex):
return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
elif isinstance(obj, ABCIndex):
h = hash_array(obj._values, encoding, hash_key, categorize).astype(
"uint64", copy=False
)
ser = Series(h, index=obj, dtype="uint64", copy=False)
elif isinstance(obj, ABCSeries):
h = hash_array(obj._values, encoding, hash_key, categorize).astype(
"uint64", copy=False
)
if index:
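            # One-shot generator so the index hash is computed lazily,
            # only when combine_hash_arrays consumes it.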
index_iter = (
hash_pandas_object(
obj.index,
index=False,
encoding=encoding,
hash_key=hash_key,
categorize=categorize,
)._values
for _ in [None]
)
arrays = itertools.chain([h], index_iter)
h = combine_hash_arrays(arrays, 2)
ser = Series(h, index=obj.index, dtype="uint64", copy=False)
elif isinstance(obj, ABCDataFrame):
hashes = (
hash_array(series._values, encoding, hash_key, categorize)
for _, series in obj.items()
)
num_items = len(obj.columns)
if index:
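            # Same lazy one-shot generator trick as in the Series branch.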
index_hash_generator = (
hash_pandas_object(
obj.index,
index=False,
encoding=encoding,
hash_key=hash_key,
categorize=categorize,
)._values
for _ in [None]
)
num_items += 1
# keep `hashes` specifically a generator to keep mypy happy
_hashes = itertools.chain(hashes, index_hash_generator)
hashes = (x for x in _hashes)
h = combine_hash_arrays(hashes, num_items)
ser = Series(h, index=obj.index, dtype="uint64", copy=False)
else:
raise TypeError(f"Unexpected type for hashing {type(obj)}")
return ser


def hash_tuples(
vals: MultiIndex | Iterable[tuple[Hashable, ...]],
encoding: str = "utf8",
hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
"""
Hash an MultiIndex / listlike-of-tuples efficiently.
Parameters
----------
vals : MultiIndex or listlike-of-tuples
encoding : str, default 'utf8'
hash_key : str, default _default_hash_key
Returns
-------
ndarray[np.uint64] of hashed values
"""
if not is_list_like(vals):
raise TypeError("must be convertible to a list-of-tuples")
from pandas import (
Categorical,
MultiIndex,
)
if not isinstance(vals, ABCMultiIndex):
mi = MultiIndex.from_tuples(vals)
else:
mi = vals
# create a list-of-Categoricals
cat_vals = [
Categorical._simple_new(
mi.codes[level],
CategoricalDtype(categories=mi.levels[level], ordered=False),
)
for level in range(mi.nlevels)
]
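    # Each level is hashed once over its (deduplicated) categories and then
    # broadcast through the codes, so repeated level values are not re-hashed.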
# hash the list-of-ndarrays
hashes = (
cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
for cat in cat_vals
)
h = combine_hash_arrays(hashes, len(cat_vals))
return h
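
# A minimal usage sketch (hash values depend on the data and hash_key):
#   >>> from pandas import MultiIndex
#   >>> mi = MultiIndex.from_tuples([(1, "a"), (2, "b")])
#   >>> hash_tuples(mi)        # ndarray of two uint64 values
#   >>> hash_tuples(list(mi))  # listlike input hashes to the same values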


def hash_array(
vals: ArrayLike,
encoding: str = "utf8",
hash_key: str = _default_hash_key,
categorize: bool = True,
) -> npt.NDArray[np.uint64]:
"""
Given a 1d array, return an array of deterministic integers.
Parameters
----------
vals : ndarray or ExtensionArray
encoding : str, default 'utf8'
Encoding for data & key when strings.
hash_key : str, default _default_hash_key
Hash_key for string key to encode.
categorize : bool, default True
Whether to first categorize object arrays before hashing. This is more
efficient when the array contains duplicate values.
Returns
-------
ndarray[np.uint64, ndim=1]
Hashed values, same length as the vals.
Examples
--------
>>> pd.util.hash_array(np.array([1, 2, 3]))
array([ 6238072747940578789, 15839785061582574730, 2185194620014831856],
dtype=uint64)
"""
if not hasattr(vals, "dtype"):
raise TypeError("must pass a ndarray-like")
if isinstance(vals, ABCExtensionArray):
return vals._hash_pandas_object(
encoding=encoding, hash_key=hash_key, categorize=categorize
)
if not isinstance(vals, np.ndarray):
# GH#42003
raise TypeError(
"hash_array requires np.ndarray or ExtensionArray, not "
f"{type(vals).__name__}. Use hash_pandas_object instead."
)
return _hash_ndarray(vals, encoding, hash_key, categorize)


def _hash_ndarray(
vals: np.ndarray,
encoding: str = "utf8",
hash_key: str = _default_hash_key,
categorize: bool = True,
) -> npt.NDArray[np.uint64]:
"""
See hash_array.__doc__.
"""
dtype = vals.dtype
# _hash_ndarray only takes 64-bit values, so handle 128-bit by parts
if np.issubdtype(dtype, np.complex128):
hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize)
hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize)
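    # Combine asymmetrically so that swapping the real and imaginary parts
    # changes the resulting hash.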
return hash_real + 23 * hash_imag
# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if dtype == bool:
vals = vals.astype("u8")
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view("i8").astype("u8", copy=False)
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
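        # View (rather than cast) to reinterpret the raw bits, e.g. floats
        # hash on their bit pattern instead of a lossy integer conversion.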
vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
else:
# With repeated values, its MUCH faster to categorize object dtypes,
# then hash and rename categories. We allow skipping the categorization
# when the values are known/likely to be unique.
if categorize:
from pandas import (
Categorical,
Index,
factorize,
)
codes, categories = factorize(vals, sort=False)
dtype = CategoricalDtype(categories=Index(categories), ordered=False)
cat = Categorical._simple_new(codes, dtype)
return cat._hash_pandas_object(
encoding=encoding, hash_key=hash_key, categorize=False
)
try:
vals = hash_object_array(vals, hash_key, encoding)
except TypeError:
# we have mixed types
vals = hash_object_array(
vals.astype(str).astype(object), hash_key, encoding
)
# Then, redistribute these 64-bit ints within the space of 64-bit ints
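    # (These shift/multiply constants are the finalization step of the
    # splitmix64 generator.)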
vals ^= vals >> 30
vals *= np.uint64(0xBF58476D1CE4E5B9)
vals ^= vals >> 27
vals *= np.uint64(0x94D049BB133111EB)
vals ^= vals >> 31
return vals