"""
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption
"""
from __future__ import annotations
import decimal
import operator
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Literal,
cast,
)
import warnings
import numpy as np
from pandas._libs import (
algos,
hashtable as htable,
iNaT,
lib,
)
from pandas._typing import (
AnyArrayLike,
ArrayLike,
AxisInt,
DtypeObj,
TakeIndexer,
npt,
)
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike,
np_find_common_type,
)
from pandas.core.dtypes.common import (
ensure_float64,
ensure_object,
ensure_platform_int,
is_array_like,
is_bool_dtype,
is_complex_dtype,
is_dict_like,
is_extension_array_dtype,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_signed_integer_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
CategoricalDtype,
ExtensionDtype,
NumpyEADtype,
)
from pandas.core.dtypes.generic import (
ABCDatetimeArray,
ABCExtensionArray,
ABCIndex,
ABCMultiIndex,
ABCSeries,
ABCTimedeltaArray,
)
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)
from pandas.core.array_algos.take import take_nd
from pandas.core.construction import (
array as pd_array,
ensure_wrapped_if_datetimelike,
extract_array,
)
from pandas.core.indexers import validate_indices
if TYPE_CHECKING:
from pandas._typing import (
ListLike,
NumpySorter,
NumpyValueArrayLike,
)
from pandas import (
Categorical,
Index,
Series,
)
from pandas.core.arrays import (
BaseMaskedArray,
ExtensionArray,
)
# --------------- #
# dtype access #
# --------------- #
def _ensure_data(values: ArrayLike) -> np.ndarray:
"""
Routine to ensure that our data is of the correct
input dtype for lower-level routines.
This will coerce:
- bool -> uint8
- datetimelike -> i8
- categorical -> codes
Integer, unsigned integer, float, and complex dtypes pass through
unchanged, except that half- and extended-precision floats are
upcast to float64 (no hashtable support for those).
Parameters
----------
values : np.ndarray or ExtensionArray
Returns
-------
np.ndarray
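Examples
--------
A minimal illustrative sketch of the bool coercion (reprs assume
default numpy print options):
>>> _ensure_data(np.array([True, False]))
array([1, 0], dtype=uint8)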
"""
if not isinstance(values, ABCMultiIndex):
# extract_array would raise
values = extract_array(values, extract_numpy=True)
if is_object_dtype(values.dtype):
return ensure_object(np.asarray(values))
elif isinstance(values.dtype, BaseMaskedDtype):
# i.e. BooleanArray, FloatingArray, IntegerArray
values = cast("BaseMaskedArray", values)
if not values._hasna:
# No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
# recurse to avoid re-implementing logic for eg bool->uint8
return _ensure_data(values._data)
return np.asarray(values)
elif isinstance(values.dtype, CategoricalDtype):
# NB: cases that go through here should NOT be using _reconstruct_data
# on the back-end.
values = cast("Categorical", values)
return values.codes
elif is_bool_dtype(values.dtype):
if isinstance(values, np.ndarray):
# i.e. actually dtype == np.dtype("bool")
return np.asarray(values).view("uint8")
else:
# e.g. Sparse[bool, False] # TODO: no test cases get here
return np.asarray(values).astype("uint8", copy=False)
elif is_integer_dtype(values.dtype):
return np.asarray(values)
elif is_float_dtype(values.dtype):
# Note: checking `values.dtype == "float128"` raises on Windows and 32bit
# error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
# has no attribute "itemsize"
if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr]
# we don't (yet) have float128 hashtable support
return ensure_float64(values)
return np.asarray(values)
elif is_complex_dtype(values.dtype):
return cast(np.ndarray, values)
# datetimelike
elif needs_i8_conversion(values.dtype):
npvalues = values.view("i8")
npvalues = cast(np.ndarray, npvalues)
return npvalues
# we have failed, return object
values = np.asarray(values, dtype=object)
return ensure_object(values)
def _reconstruct_data(
values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
"""
reverse of _ensure_data
Parameters
----------
values : np.ndarray or ExtensionArray
dtype : np.dtype or ExtensionDtype
original : AnyArrayLike
Returns
-------
ExtensionArray or np.ndarray
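Examples
--------
An illustrative round-trip with _ensure_data (reprs may vary by
numpy version):
>>> data = _ensure_data(np.array([True, False]))
>>> _reconstruct_data(data, np.dtype(bool), data)
array([ True, False])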
"""
if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
# Catch DatetimeArray/TimedeltaArray
return values
if not isinstance(dtype, np.dtype):
# i.e. ExtensionDtype; note we have ruled out above the possibility
# that values.dtype == dtype
cls = dtype.construct_array_type()
values = cls._from_sequence(values, dtype=dtype)
else:
values = values.astype(dtype, copy=False)
return values
def _ensure_arraylike(values, func_name: str) -> ArrayLike:
"""
ensure that we are arraylike if not already
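Examples
--------
An illustrative sketch; mixed-type list-likes become object ndarrays:
>>> _ensure_arraylike([1, "a"], func_name="isin-targets")
array([1, 'a'], dtype=object)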
"""
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
# GH#52986
if func_name != "isin-targets":
# Make an exception for the comps argument in isin.
warnings.warn(
f"{func_name} with argument that is not not a Series, Index, "
"ExtensionArray, or np.ndarray is deprecated and will raise in a "
"future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
inferred = lib.infer_dtype(values, skipna=False)
if inferred in ["mixed", "string", "mixed-integer"]:
# "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
if isinstance(values, tuple):
values = list(values)
values = construct_1d_object_array_from_listlike(values)
else:
values = np.asarray(values)
return values
_hashtables = {
"complex128": htable.Complex128HashTable,
"complex64": htable.Complex64HashTable,
"float64": htable.Float64HashTable,
"float32": htable.Float32HashTable,
"uint64": htable.UInt64HashTable,
"uint32": htable.UInt32HashTable,
"uint16": htable.UInt16HashTable,
"uint8": htable.UInt8HashTable,
"int64": htable.Int64HashTable,
"int32": htable.Int32HashTable,
"int16": htable.Int16HashTable,
"int8": htable.Int8HashTable,
"string": htable.StringHashTable,
"object": htable.PyObjectHashTable,
}
def _get_hashtable_algo(values: np.ndarray):
"""
Parameters
----------
values : np.ndarray
Returns
-------
htable : HashTable subclass
values : ndarray
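Examples
--------
An illustrative sketch; the hashtable class is resolved from the
coerced dtype:
>>> hashtable, values = _get_hashtable_algo(np.array([1, 2], dtype=np.int64))
>>> hashtable.__name__
'Int64HashTable'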
"""
values = _ensure_data(values)
ndtype = _check_object_for_strings(values)
hashtable = _hashtables[ndtype]
return hashtable, values
def _check_object_for_strings(values: np.ndarray) -> str:
"""
Check if we can use string hashtable instead of object hashtable.
Parameters
----------
values : ndarray
Returns
-------
str
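Examples
--------
An illustrative sketch:
>>> _check_object_for_strings(np.array(["a", "b"], dtype=object))
'string'
>>> _check_object_for_strings(np.array([1, 2], dtype=object))
'object'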
"""
ndtype = values.dtype.name
if ndtype == "object":
# it's cheaper to use a StringHashTable than a PyObjectHashTable; we
# infer including nulls because null-handling is the only difference
# between StringHashTable and PyObjectHashTable
if lib.is_string_array(values, skipna=False):
ndtype = "string"
return ndtype
# --------------- #
# top-level algos #
# --------------- #
def unique(values):
"""
Return unique values based on a hash table.
Uniques are returned in order of appearance. This does NOT sort.
Significantly faster than numpy.unique for long enough sequences.
Includes NA values.
Parameters
----------
values : 1d array-like
Returns
-------
numpy.ndarray or ExtensionArray
The return can be:
* Index : when the input is an Index
* Categorical : when the input is a Categorical dtype
* ndarray : when the input is a Series/ndarray
See Also
--------
Index.unique : Return unique values from an Index.
Series.unique : Return unique values of Series object.
Examples
--------
>>> pd.unique(pd.Series([2, 1, 3, 3]))
array([2, 1, 3])
>>> pd.unique(pd.Series([2] + [1] * 5))
array([2, 1])
>>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
>>> pd.unique(
... pd.Series(
... [
... pd.Timestamp("20160101", tz="US/Eastern"),
... pd.Timestamp("20160101", tz="US/Eastern"),
... ]
... )
... )
<DatetimeArray>
['2016-01-01 00:00:00-05:00']
Length: 1, dtype: datetime64[ns, US/Eastern]
>>> pd.unique(
... pd.Index(
... [
... pd.Timestamp("20160101", tz="US/Eastern"),
... pd.Timestamp("20160101", tz="US/Eastern"),
... ]
... )
... )
DatetimeIndex(['2016-01-01 00:00:00-05:00'],
dtype='datetime64[ns, US/Eastern]',
freq=None)
>>> pd.unique(np.array(list("baabc"), dtype="O"))
array(['b', 'a', 'c'], dtype=object)
An unordered Categorical will return categories in the
order of appearance.
>>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
['b', 'a', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
['b', 'a', 'c']
Categories (3, object): ['a', 'b', 'c']
An ordered Categorical preserves the category ordering.
>>> pd.unique(
... pd.Series(
... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
... )
... )
['b', 'a', 'c']
Categories (3, object): ['a' < 'b' < 'c']
An array of tuples
>>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
"""
return unique_with_mask(values)
def nunique_ints(values: ArrayLike) -> int:
"""
Return the number of unique values for integer array-likes.
Significantly faster than pandas.unique for long enough sequences.
No checks are done to ensure input is integral.
Parameters
----------
values : 1d array-like
Returns
-------
int : The number of unique values in ``values``
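Examples
--------
An illustrative sketch (wrapped in int() so the repr is stable
across numpy versions):
>>> int(nunique_ints(np.array([1, 2, 2, 3])))
3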
"""
if len(values) == 0:
return 0
values = _ensure_data(values)
# bincount requires intp
result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
return result
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
"""See algorithms.unique for docs. Takes a mask for masked arrays."""
values = _ensure_arraylike(values, func_name="unique")
if isinstance(values.dtype, ExtensionDtype):
# Dispatch to extension dtype's unique.
return values.unique()
original = values
hashtable, values = _get_hashtable_algo(values)
table = hashtable(len(values))
if mask is None:
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, original.dtype, original)
return uniques
else:
uniques, mask = table.unique(values, mask=mask)
uniques = _reconstruct_data(uniques, original.dtype, original)
assert mask is not None # for mypy
return uniques, mask.astype("bool")
unique1d = unique
_MINIMUM_COMP_ARR_LEN = 1_000_000
def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
"""
Compute the isin boolean array.
Parameters
----------
comps : list-like
values : list-like
Returns
-------
ndarray[bool]
Same length as `comps`.
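Examples
--------
An illustrative sketch:
>>> isin(np.array([1, 2, 3]), np.array([2, 4]))
array([False,  True, False])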
"""
if not is_list_like(comps):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a `{type(comps).__name__}`"
)
if not is_list_like(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a `{type(values).__name__}`"
)
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
orig_values = list(values)
values = _ensure_arraylike(orig_values, func_name="isin-targets")
if (
len(values) > 0
and values.dtype.kind in "iufcb"
and not is_signed_integer_dtype(comps)
):
# GH#46485 Use object to avoid upcast to float64 later
# TODO: Share with _find_common_type_compat
values = construct_1d_object_array_from_listlike(orig_values)
elif isinstance(values, ABCMultiIndex):
# Avoid raising in extract_array
values = np.array(values)
else:
values = extract_array(values, extract_numpy=True, extract_range=True)
comps_array = _ensure_arraylike(comps, func_name="isin")
comps_array = extract_array(comps_array, extract_numpy=True)
if not isinstance(comps_array, np.ndarray):
# i.e. Extension Array
return comps_array.isin(values)
elif needs_i8_conversion(comps_array.dtype):
# Dispatch to DatetimeLikeArrayMixin.isin
return pd_array(comps_array).isin(values)
elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
# e.g. comps_array are integers and values are datetime64s
return np.zeros(comps_array.shape, dtype=bool)
# TODO: not quite right ... Sparse/Categorical
elif needs_i8_conversion(values.dtype):
return isin(comps_array, values.astype(object))
elif isinstance(values.dtype, ExtensionDtype):
return isin(np.asarray(comps_array), np.asarray(values))
# GH16012
# Ensure np.isin doesn't get object types or it *may* throw an exception
# Although a hash table has O(1) look-up (vs. O(log n) for binary
# search in a sorted array), np.isin is faster for small sizes
if (
len(comps_array) > _MINIMUM_COMP_ARR_LEN
and len(values) <= 26
and comps_array.dtype != object
):
# If the values include nan we need to check for nan explicitly
# since np.nan is not equal to np.nan
if isna(values).any():
def f(c, v):
return np.logical_or(np.isin(c, v).ravel(), np.isnan(c))
else:
f = lambda a, b: np.isin(a, b).ravel()
else:
common = np_find_common_type(values.dtype, comps_array.dtype)
values = values.astype(common, copy=False)
comps_array = comps_array.astype(common, copy=False)
f = htable.ismember
return f(comps_array, values)
def factorize_array(
values: np.ndarray,
use_na_sentinel: bool = True,
size_hint: int | None = None,
na_value: object = None,
mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
"""
Factorize a numpy array to codes and uniques.
This doesn't do any coercion of types or unboxing before factorization.
Parameters
----------
values : ndarray
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not be
dropped from the uniques of the values.
size_hint : int, optional
Passed through to the hashtable's 'get_labels' method
na_value : object, optional
A value in `values` to consider missing. Note: only use this
parameter when you know that you don't have any values pandas would
consider missing in the array (NaN for float data, iNaT for
datetimes, etc.).
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
Returns
-------
codes : ndarray[np.intp]
uniques : ndarray
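Examples
--------
An illustrative sketch with object-dtype strings:
>>> codes, uniques = factorize_array(np.array(["b", "a", "b"], dtype=object))
>>> codes
array([0, 1, 0])
>>> uniques
array(['b', 'a'], dtype=object)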
"""
original = values
if values.dtype.kind in "mM":
# _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
# need to do the same to na_value. We are assuming here that the passed
# na_value is an appropriately-typed NaT.
# e.g. test_where_datetimelike_categorical
na_value = iNaT
hash_klass, values = _get_hashtable_algo(values)
table = hash_klass(size_hint or len(values))
uniques, codes = table.factorize(
values,
na_sentinel=-1,
na_value=na_value,
mask=mask,
ignore_na=use_na_sentinel,
)
# re-cast e.g. i8->dt64/td64, uint8->bool
uniques = _reconstruct_data(uniques, original.dtype, original)
codes = ensure_platform_int(codes)
return codes, uniques
@doc(
values=dedent(
"""\
values : sequence
A 1-D sequence. Sequences that aren't pandas objects are
coerced to ndarrays before factorization.
"""
),
sort=dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
size_hint=dedent(
"""\
size_hint : int, optional
Hint to the hashtable sizer.
"""
),
)
def factorize(
values,
sort: bool = False,
use_na_sentinel: bool = True,
size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
"""
Encode the object as an enumerated type or categorical variable.
This method is useful for obtaining a numeric representation of an
array when all that matters is identifying distinct values. `factorize`
is available as both a top-level function :func:`pandas.factorize`,
and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
Parameters
----------
{values}{sort}
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not be
dropped from the uniques of the values.
.. versionadded:: 1.5.0
{size_hint}\
Returns
-------
codes : ndarray
An integer ndarray that's an indexer into `uniques`.
``uniques.take(codes)`` will have the same values as `values`.
uniques : ndarray, Index, or Categorical
The unique valid values. When `values` is Categorical, `uniques`
is a Categorical. When `values` is some other pandas object, an
`Index` is returned. Otherwise, a 1-D ndarray is returned.
.. note::
Even if there's a missing value in `values`, `uniques` will
*not* contain an entry for it.
See Also
--------
cut : Discretize continuous-valued array.
unique : Find the unique values in an array.
Notes
-----
Reference :ref:`the user guide <reshaping.factorize>` for more examples.
Examples
--------
These examples all show factorize as a top-level method like
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.
>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is maintained.
>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
... sort=True)
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)
When ``use_na_sentinel=True`` (the default), missing values are indicated in
the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.
>>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
Thus far, we've only factorized lists (which are internally coerced to
NumPy arrays). When factorizing pandas objects, the type of `uniques`
will differ. For Categoricals, a `Categorical` is returned.
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
['a', 'c']
Categories (3, object): ['a', 'b', 'c']
Notice that ``'b'`` is in ``uniques.categories``, despite not being
present in ``cat.values``.
For all other pandas objects, an Index of the appropriate type is
returned.
>>> cat = pd.Series(['a', 'a', 'c'])
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
Index(['a', 'c'], dtype='object')
If NaN is in the values, and we want to include NaN in the uniques of the
values, it can be achieved by setting ``use_na_sentinel=False``.
>>> values = np.array([1, 2, 1, np.nan])
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
>>> codes
array([ 0, 1, 0, -1])
>>> uniques
array([1., 2.])
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
>>> codes
array([0, 1, 0, 2])
>>> uniques
array([ 1., 2., nan])
"""
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
# 2.) factorizing codes and uniques
# 3.) Maybe boxing the uniques in an Index
#
# Step 2 is dispatched to extension types (like Categorical). They are
# responsible only for factorization. All data coercion, sorting and boxing
# should happen here.
if isinstance(values, (ABCIndex, ABCSeries)):
return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
values = _ensure_arraylike(values, func_name="factorize")
original = values
if (
isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
and values.freq is not None
):
# The presence of 'freq' means we can fast-path sorting and know there
# aren't NAs
codes, uniques = values.factorize(sort=sort)
return codes, uniques
elif not isinstance(values, np.ndarray):
# i.e. ExtensionArray
codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
else:
values = np.asarray(values) # convert DTA/TDA/MultiIndex
if not use_na_sentinel and values.dtype == object:
# factorize can now handle differentiating various types of null values.
# These can only occur when the array has object dtype.
# However, for backwards compatibility we only use the null for the
# provided dtype. This may be revisited in the future, see GH#48476.
null_mask = isna(values)
if null_mask.any():
na_value = na_value_for_dtype(values.dtype, compat=False)
# Don't modify (potentially user-provided) array
values = np.where(null_mask, na_value, values)
codes, uniques = factorize_array(
values,
use_na_sentinel=use_na_sentinel,
size_hint=size_hint,
)
if sort and len(uniques) > 0:
uniques, codes = safe_sort(
uniques,
codes,
use_na_sentinel=use_na_sentinel,
assume_unique=True,
verify=False,
)
uniques = _reconstruct_data(uniques, original.dtype, original)
return codes, uniques
def value_counts(
values,
sort: bool = True,
ascending: bool = False,
normalize: bool = False,
bins=None,
dropna: bool = True,
) -> Series:
"""
Compute a histogram of the counts of non-null values.
Parameters
----------
values : ndarray (1-d)
sort : bool, default True
Sort by values
ascending : bool, default False
Sort in ascending order
normalize : bool, default False
If True then compute a relative histogram
bins : integer, optional
Rather than count individual values, group them into half-open bins;
a convenience for pd.cut. Only works with numeric data.
dropna : bool, default True
Don't include counts of NaN
Returns
-------
Series
"""
warnings.warn(
# GH#53493
"pandas.value_counts is deprecated and will be removed in a "
"future version. Use pd.Series(obj).value_counts() instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return value_counts_internal(
values,
sort=sort,
ascending=ascending,
normalize=normalize,
bins=bins,
dropna=dropna,
)
def value_counts_internal(
values,
sort: bool = True,
ascending: bool = False,
normalize: bool = False,
bins=None,
dropna: bool = True,
) -> Series:
from pandas import (
Index,
Series,
)
index_name = getattr(values, "name", None)
name = "proportion" if normalize else "count"
if bins is not None:
from pandas.core.reshape.tile import cut
if isinstance(values, Series):
values = values._values
try:
ii = cut(values, bins, include_lowest=True)
except TypeError as err:
raise TypeError("bins argument only works with numeric data.") from err
# count, remove nulls (from the index), and put the bins back on the index
result = ii.value_counts(dropna=dropna)
result.name = name
result = result[result.index.notna()]
result.index = result.index.astype("interval")
result = result.sort_index()
# if dropna is True and no bin has any values, return an empty result
if dropna and (result._values == 0).all():
result = result.iloc[0:0]
# normalizing is by len of all (regardless of dropna)
counts = np.array([len(ii)])
else:
if is_extension_array_dtype(values):
# handle Categorical and sparse,
result = Series(values, copy=False)._values.value_counts(dropna=dropna)
result.name = name
result.index.name = index_name
counts = result._values
if not isinstance(counts, np.ndarray):
# e.g. ArrowExtensionArray
counts = np.asarray(counts)
elif isinstance(values, ABCMultiIndex):
# GH49558
levels = list(range(values.nlevels))
result = (
Series(index=values, name=name)
.groupby(level=levels, dropna=dropna)
.size()
)
result.index.names = values.names
counts = result._values
else:
values = _ensure_arraylike(values, func_name="value_counts")
keys, counts, _ = value_counts_arraylike(values, dropna)
if keys.dtype == np.float16:
keys = keys.astype(np.float32)
# For backwards compatibility, we let Index do its normal type
# inference, _except_ for if it infers from object to bool.
idx = Index(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)
elif (
idx.dtype != keys.dtype  # noqa: PLR1714  # pylint: disable=R1714
and idx.dtype != "string[pyarrow_numpy]"
):
warnings.warn(
# GH#56161
"The behavior of value_counts with object-dtype is deprecated. "
"In a future version, this will *not* perform dtype inference "
"on the resulting index. To retain the old behavior, use "
"`result.index = result.index.infer_objects()`",
FutureWarning,
stacklevel=find_stack_level(),
)
idx.name = index_name
result = Series(counts, index=idx, name=name, copy=False)
if sort:
result = result.sort_values(ascending=ascending)
if normalize:
result = result / counts.sum()
return result
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
"""
Parameters
----------
values : np.ndarray
dropna : bool
mask : np.ndarray[bool] or None, default None
Returns
-------
uniques : np.ndarray
counts : np.ndarray[np.int64]
na_counter : int
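Examples
--------
An illustrative sketch; keys are returned in order of appearance:
>>> value_counts_arraylike(np.array([1, 2, 2]), dropna=True)
(array([1, 2]), array([1, 2]), 0)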
"""
original = values
values = _ensure_data(values)
keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)
if needs_i8_conversion(original.dtype):
# datetime, timedelta, or period
if dropna:
mask = keys != iNaT
keys, counts = keys[mask], counts[mask]
res_keys = _reconstruct_data(keys, original.dtype, original)
return res_keys, counts, na_counter
def duplicated(
values: ArrayLike,
keep: Literal["first", "last", False] = "first",
mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.bool_]:
"""
Return boolean ndarray denoting duplicate values.
Parameters
----------
values : np.ndarray or ExtensionArray
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last
occurrence.
- False : Mark all duplicates as ``True``.
mask : ndarray[bool], optional
array indicating which elements to exclude from checking
Returns
-------
duplicated : ndarray[bool]
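Examples
--------
An illustrative sketch with the default keep='first':
>>> duplicated(np.array([1, 2, 1, 3]))
array([False, False,  True, False])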
"""
values = _ensure_data(values)
return htable.duplicated(values, keep=keep, mask=mask)
def mode(
values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
"""
Returns the mode(s) of an array.
Parameters
----------
values : array-like
Array from which to compute the mode(s).
dropna : bool, default True
Don't consider counts of NaN/NaT.
mask : ndarray[bool], optional
If not None, used as an indicator of missing values.
Returns
-------
np.ndarray or ExtensionArray
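Examples
--------
An illustrative sketch:
>>> mode(np.array([1, 2, 2, 3]))
array([2])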
"""
values = _ensure_arraylike(values, func_name="mode")
original = values
if needs_i8_conversion(values.dtype):
# Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
values = ensure_wrapped_if_datetimelike(values)
values = cast("ExtensionArray", values)
return values._mode(dropna=dropna)
values = _ensure_data(values)
npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
if res_mask is not None:
return npresult, res_mask # type: ignore[return-value]
try:
npresult = np.sort(npresult)
except TypeError as err:
warnings.warn(
f"Unable to sort modes: {err}",
stacklevel=find_stack_level(),
)
result = _reconstruct_data(npresult, original.dtype, original)
return result
def rank(
values: ArrayLike,
axis: AxisInt = 0,
method: str = "average",
na_option: str = "keep",
ascending: bool = True,
pct: bool = False,
) -> npt.NDArray[np.float64]:
"""
Rank the values along a given axis.
Parameters
----------
values : np.ndarray or ExtensionArray
Array whose values will be ranked. The number of dimensions in this
array must not exceed 2.
axis : int, default 0
Axis over which to perform rankings.
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
The method by which ties are broken during the ranking.
na_option : {'keep', 'top', 'bottom'}, default 'keep'
The method by which NaNs are placed in the ranking.
- ``keep``: rank each NaN value with a NaN ranking
- ``top``: replace each NaN with +/- inf so that NaNs are
ranked at the top
- ``bottom``: replace each NaN with +/- inf so that NaNs are
ranked at the bottom
ascending : bool, default True
Whether or not the elements should be ranked in ascending order.
pct : bool, default False
Whether to display the returned rankings in percentile form
(e.g. 0.333..., 0.666..., 1.) instead of integer form (e.g. 1, 2, 3).
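Returns
-------
np.ndarray[np.float64]
The ranked values.
Examples
--------
An illustrative sketch with the default 'average' method:
>>> rank(np.array([3, 1, 2]))
array([3., 1., 2.])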
"""
is_datetimelike = needs_i8_conversion(values.dtype)
values = _ensure_data(values)
if values.ndim == 1:
ranks = algos.rank_1d(
values,
is_datetimelike=is_datetimelike,
ties_method=method,
ascending=ascending,
na_option=na_option,
pct=pct,
)
elif values.ndim == 2:
ranks = algos.rank_2d(
values,
axis=axis,
is_datetimelike=is_datetimelike,
ties_method=method,
ascending=ascending,
na_option=na_option,
pct=pct,
)
else:
raise TypeError("Arrays with ndim > 2 are not supported.")
return ranks
# ---- #
# take #
# ---- #
def take(
arr,
indices: TakeIndexer,
axis: AxisInt = 0,
allow_fill: bool = False,
fill_value=None,
):
"""
Take elements from an array.
Parameters
----------
arr : array-like or scalar value
Non array-likes (sequences/scalars without a dtype) are coerced
to an ndarray.
.. deprecated:: 2.1.0
Passing an argument other than a numpy.ndarray, ExtensionArray,
Index, or Series is deprecated.
indices : sequence of int or one-dimensional np.ndarray of int
Indices to be taken.
axis : int, default 0
The axis over which to select values.
allow_fill : bool, default False
How to handle negative values in `indices`.
* False: negative values in `indices` indicate positional indices
from the right (the default). This is similar to :func:`numpy.take`.
* True: negative values in `indices` indicate
missing values. These values are set to `fill_value`. Any other
negative values raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type (``self.dtype.na_value``) is used.
For multi-dimensional `arr`, each *element* is filled with
`fill_value`.
Returns
-------
ndarray or ExtensionArray
Same type as the input.
Raises
------
IndexError
When `indices` is out of bounds for the array.
ValueError
When the indexer contains negative values other than ``-1``
and `allow_fill` is True.
Notes
-----
When `allow_fill` is False, `indices` may be whatever dimensionality
is accepted by NumPy for `arr`.
When `allow_fill` is True, `indices` should be 1-D.
See Also
--------
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> import pandas as pd
With the default ``allow_fill=False``, negative numbers indicate
positional indices from the right.
>>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
array([10, 10, 30])
Setting ``allow_fill=True`` will place `fill_value` in those positions.
>>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
array([10., 10., nan])
>>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
... fill_value=-10)
array([ 10, 10, -10])
"""
if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)):
# GH#52981
warnings.warn(
"pd.api.extensions.take accepting non-standard inputs is deprecated "
"and will raise in a future version. Pass either a numpy.ndarray, "
"ExtensionArray, Index, or Series instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
if not is_array_like(arr):
arr = np.asarray(arr)
indices = ensure_platform_int(indices)
if allow_fill:
# Pandas style, -1 means NA
validate_indices(indices, arr.shape[axis])
result = take_nd(
arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
)
else:
# NumPy style
result = arr.take(indices, axis=axis)
return result
# ------------ #
# searchsorted #
# ------------ #
def searchsorted(
arr: ArrayLike,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
"""
Find indices where elements should be inserted to maintain order.
Find the indices into a sorted array `arr` (a) such that, if the
corresponding elements in `value` were inserted before the indices,
the order of `arr` would be preserved.
Assuming that `arr` is sorted:
====== ================================
`side` returned index `i` satisfies
====== ================================
left ``arr[i-1] < value <= arr[i]``
right ``arr[i-1] <= value < arr[i]``
====== ================================
Parameters
----------
arr : np.ndarray, ExtensionArray, Series
Input array. If `sorter` is None, then it must be sorted in
ascending order, otherwise `sorter` must be an array of indices
that sort it.
value : array-like or scalar
Values to insert into `arr`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
If 'right', return the last such index. If there is no suitable
index, return either 0 or N (where N is the length of `arr`).
sorter : 1-D array-like, optional
Optional array of integer indices that sort `arr` into ascending
order. They are typically the result of argsort.
Returns
-------
array of ints or int
If value is array-like, array of insertion points.
If value is scalar, a single integer.
See Also
--------
numpy.searchsorted : Similar method from NumPy.
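Examples
--------
An illustrative sketch:
>>> searchsorted(np.array([1, 2, 3]), [2, 4])
array([1, 3])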
"""
if sorter is not None:
sorter = ensure_platform_int(sorter)
if (
isinstance(arr, np.ndarray)
and arr.dtype.kind in "iu"
and (is_integer(value) or is_integer_dtype(value))
):
# if `arr` and `value` have different dtypes, `arr` would be
# recast by numpy, causing a slow search.
# Before searching below, we therefore try to give `value` the
# same dtype as `arr`, while guarding against integer overflows.
iinfo = np.iinfo(arr.dtype.type)
value_arr = np.array([value]) if is_integer(value) else np.array(value)
if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
# value within bounds, so no overflow, so can convert value dtype
# to dtype of arr
dtype = arr.dtype
else:
dtype = value_arr.dtype
if is_integer(value):
# We know that value is int
value = cast(int, dtype.type(value))
else:
value = pd_array(cast(ArrayLike, value), dtype=dtype)
else:
# E.g. if `arr` is an array with dtype='datetime64[ns]'
# and `value` is a pd.Timestamp, we may need to convert value
arr = ensure_wrapped_if_datetimelike(arr)
# Argument 1 to "searchsorted" of "ndarray" has incompatible type
# "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type]
# ---- #
# diff #
# ---- #
_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}
def diff(arr, n: int, axis: AxisInt = 0):
"""
Compute the difference over n periods,
analogous to s - s.shift(n).
Parameters
----------
arr : ndarray or ExtensionArray
n : int
number of periods
axis : {0, 1}
axis to shift on
Returns
-------
shifted
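Examples
--------
An illustrative sketch; integer input is upcast so the leading NaN fits:
>>> diff(np.array([1, 3, 6]), n=1)
array([nan,  2.,  3.])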
"""
n = int(n)
na = np.nan
dtype = arr.dtype
is_bool = is_bool_dtype(dtype)
if is_bool:
op = operator.xor
else:
op = operator.sub
if isinstance(dtype, NumpyEADtype):
# NumpyExtensionArray cannot necessarily hold shifted versions of itself.
arr = arr.to_numpy()
dtype = arr.dtype
if not isinstance(arr, np.ndarray):
# i.e. ExtensionArray
if hasattr(arr, f"__{op.__name__}__"):
if axis != 0:
raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
return op(arr, arr.shift(n))
else:
raise TypeError(
f"{type(arr).__name__} has no 'diff' method. "
"Convert to a suitable dtype prior to calling 'diff'."
)
is_timedelta = False
if arr.dtype.kind in "mM":
dtype = np.int64
arr = arr.view("i8")
na = iNaT
is_timedelta = True
elif is_bool:
# We have to cast in order to be able to hold np.nan
dtype = np.object_
elif dtype.kind in "iu":
# We have to cast in order to be able to hold np.nan
# int8, int16 are incompatible with float64,
# see https://github.com/cython/cython/issues/2646
if arr.dtype.name in ["int8", "int16"]:
dtype = np.float32
else:
dtype = np.float64
orig_ndim = arr.ndim
if orig_ndim == 1:
# reshape so we can always use algos.diff_2d
arr = arr.reshape(-1, 1)
# TODO: require axis == 0
dtype = np.dtype(dtype)
out_arr = np.empty(arr.shape, dtype=dtype)
na_indexer = [slice(None)] * 2
na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
out_arr[tuple(na_indexer)] = na
if arr.dtype.name in _diff_special:
# TODO: can diff_2d dtype specialization troubles be fixed by defining
# out_arr inside diff_2d?
algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
else:
# To keep mypy happy, _res_indexer is a list while res_indexer is
# a tuple, ditto for lag_indexer.
_res_indexer = [slice(None)] * 2
_res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
res_indexer = tuple(_res_indexer)
_lag_indexer = [slice(None)] * 2
_lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
lag_indexer = tuple(_lag_indexer)
out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])
if is_timedelta:
out_arr = out_arr.view("timedelta64[ns]")
if orig_ndim == 1:
out_arr = out_arr[:, 0]
return out_arr
# --------------------------------------------------------------------
# Helper functions
# Note: safe_sort is in algorithms.py instead of sorting.py because it is
# low-dependency, is used in this module, and uses private methods from
# this module.
def safe_sort(
values: Index | ArrayLike,
codes: npt.NDArray[np.intp] | None = None,
use_na_sentinel: bool = True,
assume_unique: bool = False,
verify: bool = True,
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
"""
Sort ``values`` and reorder corresponding ``codes``.
``values`` should be unique if ``codes`` is not None.
Safe for use with mixed types (int, str), orders ints before strs.
Parameters
----------
values : list-like
Sequence; must be unique if ``codes`` is not None.
codes : np.ndarray[intp] or None, default None
Indices to ``values``. All out of bound indices are treated as
"not found" and will be masked with ``-1``.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not be
dropped from the uniques of the values.
assume_unique : bool, default False
When True, ``values`` are assumed to be unique, which can speed up
the calculation. Ignored when ``codes`` is None.
verify : bool, default True
Check if codes are out of bound for the values and put out of bound
codes equal to ``-1``. If ``verify=False``, it is assumed there
are no out of bound codes. Ignored when ``codes`` is None.
Returns
-------
ordered : AnyArrayLike
Sorted ``values``
new_codes : ndarray
Reordered ``codes``; returned when ``codes`` is not None.
Raises
------
TypeError
* If ``values`` is not list-like or if ``codes`` is neither None
nor list-like
* If ``values`` cannot be sorted
ValueError
* If ``codes`` is not None and ``values`` contain duplicates.
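Examples
--------
An illustrative sketch; codes are remapped to the sorted values:
>>> safe_sort(np.array([3, 1, 2]), codes=np.array([0, 1, 2]))
(array([1, 2, 3]), array([2, 0, 1]))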
"""
if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCIndex)):
raise TypeError(
"Only np.ndarray, ExtensionArray, and Index objects are allowed to "
"be passed to safe_sort as values"
)
sorter = None
ordered: AnyArrayLike
if (
not isinstance(values.dtype, ExtensionDtype)
and lib.infer_dtype(values, skipna=False) == "mixed-integer"
):
ordered = _sort_mixed(values)
else:
try:
sorter = values.argsort()
ordered = values.take(sorter)
except (TypeError, decimal.InvalidOperation):
# Previous sorters failed or were not applicable; try `_sort_mixed`,
# which works in general but fails for the special case of 1d arrays
# of tuples, handled by `_sort_tuples` below.
if values.size and isinstance(values[0], tuple):
# error: Argument 1 to "_sort_tuples" has incompatible type
# "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
# "ndarray[Any, Any]"
ordered = _sort_tuples(values) # type: ignore[arg-type]
else:
ordered = _sort_mixed(values)
# codes:
if codes is None:
return ordered
if not is_list_like(codes):
raise TypeError(
"Only list-like objects or None are allowed to "
"be passed to safe_sort as codes"
)
codes = ensure_platform_int(np.asarray(codes))
if not assume_unique and not len(unique(values)) == len(values):
raise ValueError("values should be unique if codes is not None")
if sorter is None:
# mixed types
# error: Argument 1 to "_get_hashtable_algo" has incompatible type
# "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
# "ndarray[Any, Any]"
hash_klass, values = _get_hashtable_algo(values) # type: ignore[arg-type]
t = hash_klass(len(values))
t.map_locations(values)
sorter = ensure_platform_int(t.lookup(ordered))
if use_na_sentinel:
# take_nd is faster, but only works for na_sentinels of -1
order2 = sorter.argsort()
if verify:
mask = (codes < -len(values)) | (codes >= len(values))
codes[mask] = 0
else:
mask = None
new_codes = take_nd(order2, codes, fill_value=-1)
else:
reverse_indexer = np.empty(len(sorter), dtype=int)
reverse_indexer.put(sorter, np.arange(len(sorter)))
# Out of bound indices will be masked with `-1` next, so we
# may deal with them here without performance loss using `mode='wrap'`
new_codes = reverse_indexer.take(codes, mode="wrap")
if use_na_sentinel:
mask = codes == -1
if verify:
mask = mask | (codes < -len(values)) | (codes >= len(values))
if use_na_sentinel and mask is not None:
np.putmask(new_codes, mask, -1)
return ordered, ensure_platform_int(new_codes)
def _sort_mixed(values) -> AnyArrayLike:
"""order ints before strings before nulls in 1d arrays"""
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
null_pos = np.array([isna(x) for x in values], dtype=bool)
num_pos = ~str_pos & ~null_pos
str_argsort = np.argsort(values[str_pos])
num_argsort = np.argsort(values[num_pos])
# convert boolean arrays to positional indices, then order by underlying values
str_locs = str_pos.nonzero()[0].take(str_argsort)
num_locs = num_pos.nonzero()[0].take(num_argsort)
null_locs = null_pos.nonzero()[0]
locs = np.concatenate([num_locs, str_locs, null_locs])
return values.take(locs)
def _sort_tuples(values: np.ndarray) -> np.ndarray:
"""
Convert array of tuples (1d) to array of arrays (2d).
We need to keep the columns separately as they contain different types and
nans (can't use `np.sort` as it may fail when str and nan are mixed in a
column as types cannot be compared).
"""
from pandas.core.internals.construction import to_arrays
from pandas.core.sorting import lexsort_indexer
arrays, _ = to_arrays(values, None)
indexer = lexsort_indexer(arrays, orders=True)
return values[indexer]
def union_with_duplicates(
lvals: ArrayLike | Index, rvals: ArrayLike | Index
) -> ArrayLike | Index:
"""
Extracts the union from lvals and rvals with respect to duplicates and nans in
both arrays.
Parameters
----------
lvals : np.ndarray or ExtensionArray
Left values, which are ordered in front.
rvals : np.ndarray or ExtensionArray
Right values, ordered after lvals.
Returns
-------
np.ndarray or ExtensionArray
Containing the unsorted union of both arrays.
Notes
-----
Caller is responsible for ensuring lvals.dtype == rvals.dtype.
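Examples
--------
An illustrative sketch; each value is repeated max(left count, right
count) times:
>>> union_with_duplicates(np.array([1, 1, 2]), np.array([1, 3]))
array([1, 1, 2, 3])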
"""
from pandas import Series
with warnings.catch_warnings():
# filter warning from object dtype inference; we will end up discarding
# the index here, so the deprecation does not affect the end result.
warnings.filterwarnings(
"ignore",
"The behavior of value_counts with object-dtype is deprecated",
category=FutureWarning,
)
l_count = value_counts_internal(lvals, dropna=False)
r_count = value_counts_internal(rvals, dropna=False)
l_count, r_count = l_count.align(r_count, fill_value=0)
final_count = np.maximum(l_count.values, r_count.values)
final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
unique_vals = lvals.append(rvals).unique()
else:
if isinstance(lvals, ABCIndex):
lvals = lvals._values
if isinstance(rvals, ABCIndex):
rvals = rvals._values
# error: List item 0 has incompatible type "Union[ExtensionArray,
# ndarray[Any, Any], Index]"; expected "Union[ExtensionArray,
# ndarray[Any, Any]]"
combined = concat_compat([lvals, rvals]) # type: ignore[list-item]
unique_vals = unique(combined)
unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
repeats = final_count.reindex(unique_vals).values
return np.repeat(unique_vals, repeats)
def map_array(
arr: ArrayLike,
mapper,
na_action: Literal["ignore"] | None = None,
convert: bool = True,
) -> np.ndarray | ExtensionArray | Index:
"""
Map values using an input mapping or function.
Parameters
----------
arr : np.ndarray or ExtensionArray
Array whose values will be mapped.
mapper : function, dict, or Series
Mapping correspondence.
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NA values, without passing them to the
mapping correspondence.
convert : bool, default True
Try to find better dtype for elementwise function results. If
False, leave as dtype=object.
Returns
-------
Union[ndarray, Index, ExtensionArray]
The output of the mapping function applied to the array.
If the function returns a tuple with more than one element,
a MultiIndex will be returned.
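Examples
--------
An illustrative sketch with a function mapper:
>>> map_array(np.array([1, 2, 3]), lambda x: x * 2)
array([2, 4, 6])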
"""
if na_action not in (None, "ignore"):
msg = f"na_action must either be 'ignore' or None, {na_action} was passed"
raise ValueError(msg)
# we can fastpath dict/Series to an efficient map
# as we know that we are not going to have to yield
# python types
if is_dict_like(mapper):
if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
# If a dictionary subclass defines a default value method,
# convert mapper to a lookup function (GH #15999).
dict_with_default = mapper
mapper = lambda x: dict_with_default[
np.nan if isinstance(x, float) and np.isnan(x) else x
]
else:
# Dictionary does not have a default. Thus it's safe to
# convert to a Series for efficiency.
# we specify the keys here to handle the
# possibility that they are tuples
# The return value of mapping with an empty mapper is
# expected to be pd.Series(np.nan, ...). As np.nan is
# of dtype float64, the return value of this method should
# be float64 as well
from pandas import Series
if len(mapper) == 0:
mapper = Series(mapper, dtype=np.float64)
else:
mapper = Series(mapper)
if isinstance(mapper, ABCSeries):
if na_action == "ignore":
mapper = mapper[mapper.index.notna()]
# Since values were input, this means we came from either
# a dict or a Series; lookups go through mapper's index.
indexer = mapper.index.get_indexer(arr)
new_values = take_nd(mapper._values, indexer)
return new_values
if not len(arr):
return arr.copy()
# we must convert to python types
values = arr.astype(object, copy=False)
if na_action is None:
return lib.map_infer(values, mapper, convert=convert)
else:
return lib.map_infer_mask(
values, mapper, mask=isna(values).view(np.uint8), convert=convert
)