"""
Experimental manager based on storing a collection of 1D arrays
"""
from __future__ import annotations
import itertools
from typing import (
TYPE_CHECKING,
Callable,
Literal,
)
import numpy as np
from pandas._libs import (
NaT,
lib,
)
from pandas.core.dtypes.astype import (
astype_array,
astype_array_safe,
)
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
infer_dtype_from_scalar,
np_find_common_type,
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_datetime64_ns_dtype,
is_integer,
is_numeric_dtype,
is_object_dtype,
is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.missing import (
array_equals,
isna,
na_value_for_dtype,
)
import pandas.core.algorithms as algos
from pandas.core.array_algos.quantile import quantile_compat
from pandas.core.array_algos.take import take_1d
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
NumpyExtensionArray,
TimedeltaArray,
)
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
)
from pandas.core.indexers import (
maybe_convert_indices,
validate_indices,
)
from pandas.core.indexes.api import (
Index,
ensure_index,
)
from pandas.core.indexes.base import get_values_for_csv
from pandas.core.internals.base import (
DataManager,
SingleDataManager,
ensure_np_dtype,
interleaved_dtype,
)
from pandas.core.internals.blocks import (
BlockPlacement,
ensure_block_shape,
external_values,
extract_pandas_array,
maybe_coerce_values,
new_block,
)
from pandas.core.internals.managers import make_na_array
if TYPE_CHECKING:
from collections.abc import Hashable
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
QuantileInterpolation,
Self,
npt,
)
class BaseArrayManager(DataManager):
"""
Core internal data structure to implement DataFrame and Series.
Alternative to the BlockManager, storing a list of 1D arrays instead of
Blocks.
This is *not* a public API class
Parameters
----------
arrays : Sequence of arrays
axes : Sequence of Index
verify_integrity : bool, default True
"""
__slots__ = [
"_axes", # private attribute, because 'axes' has different order, see below
"arrays",
]
arrays: list[np.ndarray | ExtensionArray]
_axes: list[Index]
def __init__(
self,
arrays: list[np.ndarray | ExtensionArray],
axes: list[Index],
verify_integrity: bool = True,
) -> None:
raise NotImplementedError
def make_empty(self, axes=None) -> Self:
"""Return an empty ArrayManager with the items axis of len 0 (no columns)"""
        if axes is None:
            # self.axes is in BlockManager order (columns, rows), while the
            # constructor expects (rows, columns): keep the row axis, no columns
            axes = [self.axes[1], Index([])]
arrays: list[np.ndarray | ExtensionArray] = []
return type(self)(arrays, axes)
@property
def items(self) -> Index:
return self._axes[-1]
@property
# error: Signature of "axes" incompatible with supertype "DataManager"
def axes(self) -> list[Index]: # type: ignore[override]
# mypy doesn't work to override attribute with property
# see https://github.com/python/mypy/issues/4125
"""Axes is BlockManager-compatible order (columns, rows)"""
return [self._axes[1], self._axes[0]]
@property
def shape_proper(self) -> tuple[int, ...]:
# this returns (n_rows, n_columns)
return tuple(len(ax) for ax in self._axes)
@staticmethod
def _normalize_axis(axis: AxisInt) -> int:
# switch axis
axis = 1 if axis == 0 else 0
return axis
def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
# Caller is responsible for ensuring we have an Index object.
self._validate_set_axis(axis, new_labels)
axis = self._normalize_axis(axis)
self._axes[axis] = new_labels
def get_dtypes(self) -> npt.NDArray[np.object_]:
return np.array([arr.dtype for arr in self.arrays], dtype="object")
def add_references(self, mgr: BaseArrayManager) -> None:
"""
Only implemented on the BlockManager level
"""
return
def __getstate__(self):
return self.arrays, self._axes
def __setstate__(self, state) -> None:
self.arrays = state[0]
self._axes = state[1]
def __repr__(self) -> str:
output = type(self).__name__
output += f"\nIndex: {self._axes[0]}"
if self.ndim == 2:
output += f"\nColumns: {self._axes[1]}"
output += f"\n{len(self.arrays)} arrays:"
for arr in self.arrays:
output += f"\n{arr.dtype}"
return output
def apply(
self,
f,
align_keys: list[str] | None = None,
**kwargs,
) -> Self:
"""
Iterate over the arrays, collect and create a new ArrayManager.
Parameters
----------
f : str or callable
Name of the Array method to apply.
        align_keys : list[str] or None, default None
            Keys in ``kwargs`` whose values are aligned column-wise with the
            manager before being passed to ``f``.
**kwargs
Keywords to pass to `f`
Returns
-------
ArrayManager
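        Examples
        --------
        A minimal sketch, constructing the manager directly (user code
        normally reaches this through DataFrame operations):
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])],
        ...     [Index(range(3)), Index(["a", "b"])],
        ... )
        >>> mgr.apply(np.cumsum).arrays[0]
        array([1, 3, 6])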
"""
assert "filter" not in kwargs
align_keys = align_keys or []
result_arrays: list[ArrayLike] = []
# fillna: Series/DataFrame is responsible for making sure value is aligned
aligned_args = {k: kwargs[k] for k in align_keys}
if f == "apply":
f = kwargs.pop("func")
for i, arr in enumerate(self.arrays):
if aligned_args:
for k, obj in aligned_args.items():
if isinstance(obj, (ABCSeries, ABCDataFrame)):
# The caller is responsible for ensuring that
# obj.axes[-1].equals(self.items)
if obj.ndim == 1:
kwargs[k] = obj.iloc[i]
else:
kwargs[k] = obj.iloc[:, i]._values
else:
# otherwise we have an array-like
kwargs[k] = obj[i]
if callable(f):
applied = f(arr, **kwargs)
else:
applied = getattr(arr, f)(**kwargs)
result_arrays.append(applied)
new_axes = self._axes
return type(self)(result_arrays, new_axes)
def apply_with_block(self, f, align_keys=None, **kwargs) -> Self:
# switch axis to follow BlockManager logic
swap_axis = True
if f == "interpolate":
swap_axis = False
if swap_axis and "axis" in kwargs and self.ndim == 2:
kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0
align_keys = align_keys or []
aligned_args = {k: kwargs[k] for k in align_keys}
result_arrays = []
for i, arr in enumerate(self.arrays):
if aligned_args:
for k, obj in aligned_args.items():
if isinstance(obj, (ABCSeries, ABCDataFrame)):
# The caller is responsible for ensuring that
# obj.axes[-1].equals(self.items)
if obj.ndim == 1:
if self.ndim == 2:
kwargs[k] = obj.iloc[slice(i, i + 1)]._values
else:
kwargs[k] = obj.iloc[:]._values
else:
kwargs[k] = obj.iloc[:, [i]]._values
else:
# otherwise we have an ndarray
if obj.ndim == 2:
kwargs[k] = obj[[i]]
if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray):
# i.e. TimedeltaArray, DatetimeArray with tz=None. Need to
# convert for the Block constructors.
arr = np.asarray(arr)
arr = maybe_coerce_values(arr)
if self.ndim == 2:
arr = ensure_block_shape(arr, 2)
bp = BlockPlacement(slice(0, 1, 1))
block = new_block(arr, placement=bp, ndim=2)
else:
bp = BlockPlacement(slice(0, len(self), 1))
block = new_block(arr, placement=bp, ndim=1)
applied = getattr(block, f)(**kwargs)
if isinstance(applied, list):
applied = applied[0]
arr = applied.values
if self.ndim == 2 and arr.ndim == 2:
# 2D for np.ndarray or DatetimeArray/TimedeltaArray
assert len(arr) == 1
# error: No overload variant of "__getitem__" of "ExtensionArray"
# matches argument type "Tuple[int, slice]"
arr = arr[0, :] # type: ignore[call-overload]
result_arrays.append(arr)
return type(self)(result_arrays, self._axes)
def setitem(self, indexer, value, warn: bool = True) -> Self:
return self.apply_with_block("setitem", indexer=indexer, value=value)
def diff(self, n: int) -> Self:
assert self.ndim == 2 # caller ensures
return self.apply(algos.diff, n=n)
def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self:
if copy is None:
copy = True
return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)
def convert(self, copy: bool | None) -> Self:
if copy is None:
copy = True
def _convert(arr):
if is_object_dtype(arr.dtype):
# extract NumpyExtensionArray for tests that patch
# NumpyExtensionArray._typ
arr = np.asarray(arr)
result = lib.maybe_convert_objects(
arr,
convert_non_numeric=True,
)
if result is arr and copy:
return arr.copy()
return result
else:
return arr.copy() if copy else arr
return self.apply(_convert)
def get_values_for_csv(
self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None
) -> Self:
return self.apply(
get_values_for_csv,
na_rep=na_rep,
quoting=quoting,
float_format=float_format,
date_format=date_format,
decimal=decimal,
)
@property
def any_extension_types(self) -> bool:
"""Whether any of the blocks in this manager are extension blocks"""
return False # any(block.is_extension for block in self.blocks)
@property
def is_view(self) -> bool:
"""return a boolean if we are a single block and are a view"""
# TODO what is this used for?
return False
@property
def is_single_block(self) -> bool:
return len(self.arrays) == 1
def _get_data_subset(self, predicate: Callable) -> Self:
indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
arrays = [self.arrays[i] for i in indices]
# TODO copy?
# Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq,
# see test_describe_datetime_columns
taker = np.array(indices, dtype="intp")
new_cols = self._axes[1].take(taker)
new_axes = [self._axes[0], new_cols]
return type(self)(arrays, new_axes, verify_integrity=False)
def get_bool_data(self, copy: bool = False) -> Self:
"""
Select columns that are bool-dtype and object-dtype columns that are all-bool.
Parameters
----------
copy : bool, default False
Whether to copy the blocks
"""
return self._get_data_subset(lambda x: x.dtype == np.dtype(bool))
def get_numeric_data(self, copy: bool = False) -> Self:
"""
Select columns that have a numeric dtype.
Parameters
----------
copy : bool, default False
Whether to copy the blocks
"""
return self._get_data_subset(
lambda arr: is_numeric_dtype(arr.dtype)
or getattr(arr.dtype, "_is_numeric", False)
)
def copy(self, deep: bool | Literal["all"] | None = True) -> Self:
"""
Make deep or shallow copy of ArrayManager
Parameters
----------
deep : bool or string, default True
If False, return shallow copy (do not copy data)
If 'all', copy data and a deep copy of the index
Returns
-------
        ArrayManager
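        Examples
        --------
        A minimal sketch; a deep copy duplicates the stored arrays, while a
        shallow copy shares them:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3])], [Index(range(3)), Index(["a"])]
        ... )
        >>> mgr.copy(deep=True).arrays[0] is mgr.arrays[0]
        False
        >>> mgr.copy(deep=False).arrays[0] is mgr.arrays[0]
        True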
"""
if deep is None:
# ArrayManager does not yet support CoW, so deep=None always means
# deep=True for now
deep = True
# this preserves the notion of view copying of axes
if deep:
# hit in e.g. tests.io.json.test_pandas
def copy_func(ax):
return ax.copy(deep=True) if deep == "all" else ax.view()
new_axes = [copy_func(ax) for ax in self._axes]
else:
new_axes = list(self._axes)
if deep:
new_arrays = [arr.copy() for arr in self.arrays]
else:
new_arrays = list(self.arrays)
return type(self)(new_arrays, new_axes, verify_integrity=False)
def reindex_indexer(
self,
new_axis,
indexer,
axis: AxisInt,
fill_value=None,
allow_dups: bool = False,
copy: bool | None = True,
# ignored keywords
only_slice: bool = False,
# ArrayManager specific keywords
use_na_proxy: bool = False,
) -> Self:
axis = self._normalize_axis(axis)
return self._reindex_indexer(
new_axis,
indexer,
axis,
fill_value,
allow_dups,
copy,
use_na_proxy,
)
def _reindex_indexer(
self,
new_axis,
indexer: npt.NDArray[np.intp] | None,
axis: AxisInt,
fill_value=None,
allow_dups: bool = False,
copy: bool | None = True,
use_na_proxy: bool = False,
) -> Self:
"""
Parameters
----------
new_axis : Index
        indexer : ndarray[intp] or None
            pandas-indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool, default True
"""
if copy is None:
# ArrayManager does not yet support CoW, so deep=None always means
# deep=True for now
copy = True
if indexer is None:
if new_axis is self._axes[axis] and not copy:
return self
result = self.copy(deep=copy)
result._axes = list(self._axes)
result._axes[axis] = new_axis
return result
# some axes don't allow reindexing with dups
if not allow_dups:
self._axes[axis]._validate_can_reindex(indexer)
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
if axis == 1:
new_arrays = []
for i in indexer:
if i == -1:
arr = self._make_na_array(
fill_value=fill_value, use_na_proxy=use_na_proxy
)
else:
arr = self.arrays[i]
if copy:
arr = arr.copy()
new_arrays.append(arr)
else:
validate_indices(indexer, len(self._axes[0]))
indexer = ensure_platform_int(indexer)
mask = indexer == -1
needs_masking = mask.any()
new_arrays = [
take_1d(
arr,
indexer,
allow_fill=needs_masking,
fill_value=fill_value,
mask=mask,
# if fill_value is not None else blk.fill_value
)
for arr in self.arrays
]
new_axes = list(self._axes)
new_axes[axis] = new_axis
return type(self)(new_arrays, new_axes, verify_integrity=False)
def take(
self,
indexer: npt.NDArray[np.intp],
axis: AxisInt = 1,
verify: bool = True,
) -> Self:
"""
Take items along any axis.
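        Examples
        --------
        A minimal sketch; with the default ``axis=1`` this takes rows, since
        the public axis numbering follows BlockManager order:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3])], [Index(range(3)), Index(["a"])]
        ... )
        >>> mgr.take(np.array([2, 0], dtype=np.intp)).arrays[0]
        array([3, 1])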
"""
assert isinstance(indexer, np.ndarray), type(indexer)
assert indexer.dtype == np.intp, indexer.dtype
axis = self._normalize_axis(axis)
if not indexer.ndim == 1:
raise ValueError("indexer should be 1-dimensional")
n = self.shape_proper[axis]
indexer = maybe_convert_indices(indexer, n, verify=verify)
new_labels = self._axes[axis].take(indexer)
return self._reindex_indexer(
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
)
def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
if use_na_proxy:
assert fill_value is None
return NullArrayProxy(self.shape_proper[0])
if fill_value is None:
fill_value = np.nan
dtype, fill_value = infer_dtype_from_scalar(fill_value)
array_values = make_na_array(dtype, self.shape_proper[:1], fill_value)
return array_values
def _equal_values(self, other) -> bool:
"""
Used in .equals defined in base class. Only check the column values
assuming shape and indexes have already been checked.
"""
for left, right in zip(self.arrays, other.arrays):
if not array_equals(left, right):
return False
return True
# TODO
# to_dict
class ArrayManager(BaseArrayManager):
@property
def ndim(self) -> Literal[2]:
return 2
def __init__(
self,
arrays: list[np.ndarray | ExtensionArray],
axes: list[Index],
verify_integrity: bool = True,
) -> None:
# Note: we are storing the axes in "_axes" in the (row, columns) order
# which contrasts the order how it is stored in BlockManager
self._axes = axes
self.arrays = arrays
if verify_integrity:
self._axes = [ensure_index(ax) for ax in axes]
arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays]
self.arrays = [maybe_coerce_values(arr) for arr in arrays]
self._verify_integrity()
def _verify_integrity(self) -> None:
n_rows, n_columns = self.shape_proper
if not len(self.arrays) == n_columns:
raise ValueError(
"Number of passed arrays must equal the size of the column Index: "
f"{len(self.arrays)} arrays vs {n_columns} columns."
)
for arr in self.arrays:
if not len(arr) == n_rows:
raise ValueError(
"Passed arrays should have the same length as the rows Index: "
f"{len(arr)} vs {n_rows} rows"
)
if not isinstance(arr, (np.ndarray, ExtensionArray)):
raise ValueError(
"Passed arrays should be np.ndarray or ExtensionArray instances, "
f"got {type(arr)} instead"
)
if not arr.ndim == 1:
raise ValueError(
"Passed arrays should be 1-dimensional, got array with "
f"{arr.ndim} dimensions instead."
)
# --------------------------------------------------------------------
# Indexing
def fast_xs(self, loc: int) -> SingleArrayManager:
"""
Return the array corresponding to `frame.iloc[loc]`.
Parameters
----------
loc : int
Returns
-------
        SingleArrayManager
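        Examples
        --------
        A minimal sketch; the row is interleaved into a single array using
        the common dtype of the columns:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])],
        ...     [Index(range(3)), Index(["a", "b"])],
        ... )
        >>> mgr.fast_xs(1).array
        array([2., 5.])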
"""
dtype = interleaved_dtype([arr.dtype for arr in self.arrays])
values = [arr[loc] for arr in self.arrays]
if isinstance(dtype, ExtensionDtype):
result = dtype.construct_array_type()._from_sequence(values, dtype=dtype)
# for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT
elif is_datetime64_ns_dtype(dtype):
result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray
elif is_timedelta64_ns_dtype(dtype):
result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray
else:
result = np.array(values, dtype=dtype)
return SingleArrayManager([result], [self._axes[1]])
def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
axis = self._normalize_axis(axis)
if axis == 0:
arrays = [arr[slobj] for arr in self.arrays]
elif axis == 1:
arrays = self.arrays[slobj]
new_axes = list(self._axes)
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
return type(self)(arrays, new_axes, verify_integrity=False)
def iget(self, i: int) -> SingleArrayManager:
"""
Return the data as a SingleArrayManager.
"""
values = self.arrays[i]
return SingleArrayManager([values], [self._axes[0]])
def iget_values(self, i: int) -> ArrayLike:
"""
Return the data for column i as the values (ndarray or ExtensionArray).
"""
return self.arrays[i]
@property
def column_arrays(self) -> list[ArrayLike]:
"""
Used in the JSON C code to access column arrays.
"""
return [np.asarray(arr) for arr in self.arrays]
def iset(
self,
loc: int | slice | np.ndarray,
value: ArrayLike,
inplace: bool = False,
refs=None,
) -> None:
"""
Set new column(s).
        This changes the ArrayManager in-place, but replaces (an) existing
        column(s); it does not change column values in-place.
Parameters
----------
loc : integer, slice or boolean mask
Positional location (already bounds checked)
value : np.ndarray or ExtensionArray
inplace : bool, default False
            Whether to overwrite the existing array, as opposed to replacing it.
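        Examples
        --------
        A minimal sketch, replacing the array backing column 0:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3])], [Index(range(3)), Index(["a"])]
        ... )
        >>> mgr.iset(0, np.array([7, 8, 9]))
        >>> mgr.iget_values(0)
        array([7, 8, 9])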
"""
# single column -> single integer index
if lib.is_integer(loc):
# TODO can we avoid needing to unpack this here? That means converting
# DataFrame into 1D array when loc is an integer
if isinstance(value, np.ndarray) and value.ndim == 2:
assert value.shape[1] == 1
value = value[:, 0]
# TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
# but we should avoid that and pass directly the proper array
value = maybe_coerce_values(value)
assert isinstance(value, (np.ndarray, ExtensionArray))
assert value.ndim == 1
assert len(value) == len(self._axes[0])
self.arrays[loc] = value
return
# multiple columns -> convert slice or array to integer indices
elif isinstance(loc, slice):
indices: range | np.ndarray = range(
loc.start if loc.start is not None else 0,
loc.stop if loc.stop is not None else self.shape_proper[1],
loc.step if loc.step is not None else 1,
)
else:
assert isinstance(loc, np.ndarray)
assert loc.dtype == "bool"
indices = np.nonzero(loc)[0]
assert value.ndim == 2
assert value.shape[0] == len(self._axes[0])
for value_idx, mgr_idx in enumerate(indices):
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[slice, int]"
value_arr = value[:, value_idx] # type: ignore[call-overload]
self.arrays[mgr_idx] = value_arr
return
def column_setitem(
self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
) -> None:
"""
Set values ("setitem") into a single column (not setting the full column).
This is a method on the ArrayManager level, to avoid creating an
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
"""
if not is_integer(loc):
raise TypeError("The column index should be an integer")
arr = self.arrays[loc]
mgr = SingleArrayManager([arr], [self._axes[0]])
if inplace_only:
mgr.setitem_inplace(idx, value)
else:
new_mgr = mgr.setitem((idx,), value)
# update existing ArrayManager in-place
self.arrays[loc] = new_mgr.arrays[0]
def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
"""
Insert item at selected position.
Parameters
----------
loc : int
item : hashable
value : np.ndarray or ExtensionArray
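        Examples
        --------
        A minimal sketch, inserting a new column between the existing ones:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2]), np.array([3, 4])],
        ...     [Index(range(2)), Index(["a", "b"])],
        ... )
        >>> mgr.insert(1, "c", np.array([5, 6]))
        >>> mgr.items
        Index(['a', 'c', 'b'], dtype='object')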
"""
# insert to the axis; this could possibly raise a TypeError
new_axis = self.items.insert(loc, item)
value = extract_array(value, extract_numpy=True)
if value.ndim == 2:
if value.shape[0] == 1:
# error: No overload variant of "__getitem__" of "ExtensionArray"
# matches argument type "Tuple[int, slice]"
value = value[0, :] # type: ignore[call-overload]
else:
raise ValueError(
f"Expected a 1D array, got an array with shape {value.shape}"
)
value = maybe_coerce_values(value)
# TODO self.arrays can be empty
# assert len(value) == len(self.arrays[0])
# TODO is this copy needed?
arrays = self.arrays.copy()
arrays.insert(loc, value)
self.arrays = arrays
self._axes[1] = new_axis
def idelete(self, indexer) -> ArrayManager:
"""
        Delete selected locations in-place (new arrays, same ArrayManager)
"""
to_keep = np.ones(self.shape[0], dtype=np.bool_)
to_keep[indexer] = False
self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]]
self._axes = [self._axes[0], self._axes[1][to_keep]]
return self
# --------------------------------------------------------------------
# Array-wise Operation
def grouped_reduce(self, func: Callable) -> Self:
"""
Apply grouped reduction function columnwise, returning a new ArrayManager.
Parameters
----------
func : grouped reduction function
Returns
-------
ArrayManager
"""
result_arrays: list[np.ndarray] = []
result_indices: list[int] = []
for i, arr in enumerate(self.arrays):
# grouped_reduce functions all expect 2D arrays
arr = ensure_block_shape(arr, ndim=2)
res = func(arr)
if res.ndim == 2:
# reverse of ensure_block_shape
assert res.shape[0] == 1
res = res[0]
result_arrays.append(res)
result_indices.append(i)
if len(result_arrays) == 0:
nrows = 0
else:
nrows = result_arrays[0].shape[0]
index = Index(range(nrows))
columns = self.items
# error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
# expected "List[Union[ndarray, ExtensionArray]]"
return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
def reduce(self, func: Callable) -> Self:
"""
Apply reduction function column-wise, returning a single-row ArrayManager.
Parameters
----------
func : reduction function
Returns
-------
ArrayManager
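        Examples
        --------
        A minimal sketch; each column is reduced to a length-1 array:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3])], [Index(range(3)), Index(["a"])]
        ... )
        >>> mgr.reduce(np.sum).arrays[0]
        array([6])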
"""
result_arrays: list[np.ndarray] = []
for i, arr in enumerate(self.arrays):
res = func(arr, axis=0)
# TODO NaT doesn't preserve dtype, so we need to ensure to create
# a timedelta result array if original was timedelta
# what if datetime results in timedelta? (eg std)
dtype = arr.dtype if res is NaT else None
result_arrays.append(
sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type]
)
index = Index._simple_new(np.array([None], dtype=object)) # placeholder
columns = self.items
# error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
# expected "List[Union[ndarray, ExtensionArray]]"
new_mgr = type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
return new_mgr
def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
"""
        Apply array_op column-wise with another (aligned) ArrayManager.
"""
# TODO what if `other` is BlockManager ?
left_arrays = self.arrays
right_arrays = other.arrays
result_arrays = [
array_op(left, right) for left, right in zip(left_arrays, right_arrays)
]
return type(self)(result_arrays, self._axes)
def quantile(
self,
*,
qs: Index, # with dtype float64
transposed: bool = False,
interpolation: QuantileInterpolation = "linear",
) -> ArrayManager:
arrs = [ensure_block_shape(x, 2) for x in self.arrays]
new_arrs = [
quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs
]
for i, arr in enumerate(new_arrs):
if arr.ndim == 2:
assert arr.shape[0] == 1, arr.shape
new_arrs[i] = arr[0]
axes = [qs, self._axes[1]]
return type(self)(new_arrs, axes)
# ----------------------------------------------------------------
def unstack(self, unstacker, fill_value) -> ArrayManager:
"""
        Return an ArrayManager with all arrays unstacked.
Parameters
----------
unstacker : reshape._Unstacker
fill_value : Any
fill_value for newly introduced missing values.
Returns
-------
        unstacked : ArrayManager
"""
indexer, _ = unstacker._indexer_and_to_sort
if unstacker.mask.all():
new_indexer = indexer
allow_fill = False
new_mask2D = None
needs_masking = None
else:
new_indexer = np.full(unstacker.mask.shape, -1)
new_indexer[unstacker.mask] = indexer
allow_fill = True
# calculating the full mask once and passing it to take_1d is faster
# than letting take_1d calculate it in each repeated call
new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
needs_masking = new_mask2D.any(axis=0)
new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
new_indexer2D = ensure_platform_int(new_indexer2D)
new_arrays = []
for arr in self.arrays:
for i in range(unstacker.full_shape[1]):
if allow_fill:
# error: Value of type "Optional[Any]" is not indexable [index]
new_arr = take_1d(
arr,
new_indexer2D[:, i],
allow_fill=needs_masking[i], # type: ignore[index]
fill_value=fill_value,
mask=new_mask2D[:, i], # type: ignore[index]
)
else:
new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False)
new_arrays.append(new_arr)
new_index = unstacker.new_index
new_columns = unstacker.get_new_columns(self._axes[1])
new_axes = [new_index, new_columns]
return type(self)(new_arrays, new_axes, verify_integrity=False)
def as_array(
self,
dtype=None,
copy: bool = False,
na_value: object = lib.no_default,
) -> np.ndarray:
"""
        Convert the ArrayManager data into a numpy array.
Parameters
----------
dtype : object, default None
Data type of the return array.
copy : bool, default False
If True then guarantee that a copy is returned. A value of
False does not guarantee that the underlying data is not
copied.
na_value : object, default lib.no_default
Value to be used as the missing value sentinel.
Returns
-------
arr : ndarray
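        Examples
        --------
        A minimal sketch; mixed int/float columns interleave to float64:
        >>> import numpy as np
        >>> from pandas import Index
        >>> from pandas.core.internals.array_manager import ArrayManager
        >>> mgr = ArrayManager(
        ...     [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])],
        ...     [Index(range(3)), Index(["a", "b"])],
        ... )
        >>> result = mgr.as_array()
        >>> result.shape, result.dtype
        ((3, 2), dtype('float64'))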
"""
if len(self.arrays) == 0:
empty_arr = np.empty(self.shape, dtype=float)
return empty_arr.transpose()
# We want to copy when na_value is provided to avoid
# mutating the original object
copy = copy or na_value is not lib.no_default
if not dtype:
dtype = interleaved_dtype([arr.dtype for arr in self.arrays])
dtype = ensure_np_dtype(dtype)
result = np.empty(self.shape_proper, dtype=dtype)
for i, arr in enumerate(self.arrays):
arr = arr.astype(dtype, copy=copy)
result[:, i] = arr
if na_value is not lib.no_default:
result[isna(result)] = na_value
return result
@classmethod
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
"""
Concatenate uniformly-indexed ArrayManagers horizontally.
"""
# concatting along the columns -> combine reindexed arrays in a single manager
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr
@classmethod
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
"""
Concatenate uniformly-indexed ArrayManagers vertically.
"""
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr
class SingleArrayManager(BaseArrayManager, SingleDataManager):
__slots__ = [
"_axes", # private attribute, because 'axes' has different order, see below
"arrays",
]
arrays: list[np.ndarray | ExtensionArray]
_axes: list[Index]
@property
def ndim(self) -> Literal[1]:
return 1
def __init__(
self,
arrays: list[np.ndarray | ExtensionArray],
axes: list[Index],
verify_integrity: bool = True,
) -> None:
self._axes = axes
self.arrays = arrays
if verify_integrity:
assert len(axes) == 1
assert len(arrays) == 1
self._axes = [ensure_index(ax) for ax in self._axes]
arr = arrays[0]
arr = maybe_coerce_values(arr)
arr = extract_pandas_array(arr, None, 1)[0]
self.arrays = [arr]
self._verify_integrity()
def _verify_integrity(self) -> None:
(n_rows,) = self.shape
assert len(self.arrays) == 1
arr = self.arrays[0]
assert len(arr) == n_rows
if not arr.ndim == 1:
raise ValueError(
"Passed array should be 1-dimensional, got array with "
f"{arr.ndim} dimensions instead."
)
@staticmethod
def _normalize_axis(axis):
return axis
def make_empty(self, axes=None) -> Self:
"""Return an empty ArrayManager with index/array of length 0"""
if axes is None:
axes = [Index([], dtype=object)]
array: np.ndarray = np.array([], dtype=self.dtype)
return type(self)([array], axes)
@classmethod
def from_array(cls, array, index) -> SingleArrayManager:
return cls([array], [index])
# error: Cannot override writeable attribute with read-only property
@property
def axes(self) -> list[Index]: # type: ignore[override]
return self._axes
@property
def index(self) -> Index:
return self._axes[0]
@property
def dtype(self):
return self.array.dtype
def external_values(self):
"""The array that Series.values returns"""
return external_values(self.array)
def internal_values(self):
"""The array that Series._values returns"""
return self.array
def array_values(self):
"""The array that Series.array returns"""
arr = self.array
if isinstance(arr, np.ndarray):
arr = NumpyExtensionArray(arr)
return arr
@property
def _can_hold_na(self) -> bool:
if isinstance(self.array, np.ndarray):
return self.array.dtype.kind not in "iub"
else:
# ExtensionArray
return self.array._can_hold_na
@property
def is_single_block(self) -> bool:
return True
def fast_xs(self, loc: int) -> SingleArrayManager:
raise NotImplementedError("Use series._values[loc] instead")
def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager:
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
new_array = self.array[slobj]
new_index = self.index._getitem_slice(slobj)
return type(self)([new_array], [new_index], verify_integrity=False)
def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> SingleArrayManager:
new_array = self.array[indexer]
new_index = self.index[indexer]
return type(self)([new_array], [new_index])
# error: Signature of "apply" incompatible with supertype "BaseArrayManager"
def apply(self, func, **kwargs) -> Self: # type: ignore[override]
if callable(func):
new_array = func(self.array, **kwargs)
else:
new_array = getattr(self.array, func)(**kwargs)
return type(self)([new_array], self._axes)
def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager:
"""
Set values with indexer.
For SingleArrayManager, this backs s[indexer] = value
See `setitem_inplace` for a version that works inplace and doesn't
return a new Manager.
"""
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")
return self.apply_with_block("setitem", indexer=indexer, value=value)
def idelete(self, indexer) -> SingleArrayManager:
"""
Delete selected locations in-place (new array, same ArrayManager)
"""
to_keep = np.ones(self.shape[0], dtype=np.bool_)
to_keep[indexer] = False
self.arrays = [self.arrays[0][to_keep]]
self._axes = [self._axes[0][to_keep]]
return self
def _get_data_subset(self, predicate: Callable) -> SingleArrayManager:
# used in get_numeric_data / get_bool_data
if predicate(self.array):
return type(self)(self.arrays, self._axes, verify_integrity=False)
else:
return self.make_empty()
def set_values(self, values: ArrayLike) -> None:
"""
Set (replace) the values of the SingleArrayManager in place.
Use at your own risk! This does not check if the passed values are
valid for the current SingleArrayManager (length, dtype, etc).
"""
self.arrays[0] = values
def to_2d_mgr(self, columns: Index) -> ArrayManager:
"""
Manager analogue of Series.to_frame
"""
arrays = [self.arrays[0]]
axes = [self.axes[0], columns]
return ArrayManager(arrays, axes, verify_integrity=False)
class NullArrayProxy:
"""
Proxy object for an all-NA array.
Only stores the length of the array, and not the dtype. The dtype
will only be known when actually concatenating (after determining the
common dtype, for which this proxy is ignored).
    Using this object avoids having internals/concat.py determine the
    proper dtype and array type up front.
"""
ndim = 1
def __init__(self, n: int) -> None:
self.n = n
@property
def shape(self) -> tuple[int]:
return (self.n,)
def to_array(self, dtype: DtypeObj) -> ArrayLike:
"""
Helper function to create the actual all-NA array from the NullArrayProxy
object.
Parameters
----------
        dtype : DtypeObj
            The dtype for the resulting array.
Returns
-------
np.ndarray or ExtensionArray
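        Examples
        --------
        A minimal sketch; an int64 target dtype is promoted so it can hold NA:
        >>> import numpy as np
        >>> from pandas.core.internals.array_manager import NullArrayProxy
        >>> NullArrayProxy(2).to_array(np.dtype("int64"))
        array([nan, nan])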
"""
if isinstance(dtype, ExtensionDtype):
empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
indexer = -np.ones(self.n, dtype=np.intp)
return empty.take(indexer, allow_fill=True)
else:
# when introducing missing values, int becomes float, bool becomes object
dtype = ensure_dtype_can_hold_na(dtype)
fill_value = na_value_for_dtype(dtype)
arr = np.empty(self.n, dtype=dtype)
arr.fill(fill_value)
return ensure_wrapped_if_datetimelike(arr)
def concat_arrays(to_concat: list) -> ArrayLike:
"""
Alternative for concat_compat but specialized for use in the ArrayManager.
    Differences: only deals with 1D arrays (no axis keyword), assumes the
    inputs already passed through ensure_wrapped_if_datetimelike, and does
    not skip empty arrays when determining the dtype.
In addition ensures that all NullArrayProxies get replaced with actual
arrays.
Parameters
----------
to_concat : list of arrays
Returns
-------
np.ndarray or ExtensionArray
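    Examples
    --------
    A minimal sketch; the proxy materializes with the common dtype of the
    real arrays:
    >>> import numpy as np
    >>> from pandas.core.internals.array_manager import (
    ...     NullArrayProxy,
    ...     concat_arrays,
    ... )
    >>> concat_arrays([np.array([1.5, 2.5]), NullArrayProxy(1)])
    array([1.5, 2.5, nan])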
"""
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
dtypes = {x.dtype for x in to_concat_no_proxy}
single_dtype = len(dtypes) == 1
if single_dtype:
target_dtype = to_concat_no_proxy[0].dtype
elif all(lib.is_np_dtype(x, "iub") for x in dtypes):
# GH#42092
target_dtype = np_find_common_type(*dtypes)
else:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
to_concat = [
arr.to_array(target_dtype)
if isinstance(arr, NullArrayProxy)
else astype_array(arr, target_dtype, copy=False)
for arr in to_concat
]
if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)
result = np.concatenate(to_concat)
# TODO decide on exact behaviour (we shouldn't do this only for empty result)
# see https://github.com/pandas-dev/pandas/issues/39817
if len(result) == 0:
# all empties -> check for bool to not coerce to float
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
if len(kinds) != 1:
if "b" in kinds:
result = result.astype(object)
return result