summaryrefslogtreecommitdiff
path: root/numpy/lib/function_base.py
diff options
context:
space:
mode:
author Eric Wieser <wieser.eric@gmail.com> 2017-10-18 23:46:39 -0700
committer Eric Wieser <wieser.eric@gmail.com> 2017-10-19 23:04:32 -0700
commit 57d225c11bb253981639d52442384eb3e43bb5f6 (patch)
tree 494cdc7eccca1c9476ff5024468838c3354eef93 /numpy/lib/function_base.py
parent e657629bbc2bfb880a1b2fa24a39c5921c1f965e (diff)
download python-numpy-57d225c11bb253981639d52442384eb3e43bb5f6.tar.gz
python-numpy-57d225c11bb253981639d52442384eb3e43bb5f6.tar.bz2
python-numpy-57d225c11bb253981639d52442384eb3e43bb5f6.zip
MAINT: Tidy np.histogram, and improve error messages
Split up the overloaded `bins` variable into separate names depending on its meaning. Helpful errors are now emitted for: * non-integer bin counts (fixes gh-8072) * non-1d bin edges. Removes another use of `np.isscalar`...
Diffstat (limited to 'numpy/lib/function_base.py')
-rw-r--r-- numpy/lib/function_base.py 136
1 files changed, 77 insertions, 59 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 2745b49d1..1de4e906c 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -4,6 +4,7 @@ import collections
import re
import sys
import warnings
+import operator
import numpy as np
import numpy.core.numeric as _nx
@@ -646,7 +647,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
a = asarray(a)
if weights is not None:
weights = asarray(weights)
- if np.any(weights.shape != a.shape):
+ if weights.shape != a.shape:
raise ValueError(
'weights should have the same shape as a.')
weights = weights.ravel()
@@ -671,11 +672,21 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
mn -= 0.5
mx += 0.5
+ # density overrides the normed keyword
+ if density is not None:
+ normed = False
+
+ # parse the overloaded bins argument
+ n_equal_bins = None
+ bin_edges = None
+
if isinstance(bins, basestring):
+ bin_name = bins
# if `bins` is a string for an automatic method,
# this will replace it with the number of bins calculated
- if bins not in _hist_bin_selectors:
- raise ValueError("{0} not a valid estimator for bins".format(bins))
+ if bin_name not in _hist_bin_selectors:
+ raise ValueError(
+ "{!r} is not a valid estimator for `bins`".format(bin_name))
if weights is not None:
raise TypeError("Automated estimation of the number of "
"bins is not supported for weighted data")
@@ -689,16 +700,40 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
b = a[keep]
if b.size == 0:
- bins = 1
+ n_equal_bins = 1
else:
# Do not call selectors on empty arrays
- width = _hist_bin_selectors[bins](b)
+ width = _hist_bin_selectors[bin_name](b)
if width:
- bins = int(np.ceil((mx - mn) / width))
+ n_equal_bins = int(np.ceil((mx - mn) / width))
else:
# Width can be zero for some estimators, e.g. FD when
# the IQR of the data is zero.
- bins = 1
+ n_equal_bins = 1
+
+ elif np.ndim(bins) == 0:
+ try:
+ n_equal_bins = operator.index(bins)
+ except TypeError:
+ raise TypeError(
+ '`bins` must be an integer, a string, or an array')
+ if n_equal_bins < 1:
+ raise ValueError('`bins` must be positive, when an integer')
+
+ elif np.ndim(bins) == 1:
+ bin_edges = np.asarray(bins)
+ if np.any(bin_edges[:-1] > bin_edges[1:]):
+ raise ValueError(
+ '`bins` must increase monotonically, when an array')
+
+ else:
+ raise ValueError('`bins` must be 1d, when an array')
+
+ del bins
+
+ # compute the bins if only the count was specified
+ if n_equal_bins is not None:
+ bin_edges = linspace(mn, mx, n_equal_bins + 1, endpoint=True)
# Histogram is an integer or a float array depending on the weights.
if weights is None:
@@ -710,27 +745,24 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
# computing histograms, to minimize memory usage.
BLOCK = 65536
- if not iterable(bins):
- if np.isscalar(bins) and bins < 1:
- raise ValueError(
- '`bins` should be a positive integer.')
- # At this point, if the weights are not integer, floating point, or
- # complex, we have to use the slow algorithm.
- if weights is not None and not (np.can_cast(weights.dtype, np.double) or
- np.can_cast(weights.dtype, complex)):
- bins = linspace(mn, mx, bins + 1, endpoint=True)
-
- if not iterable(bins):
+ # The fast path uses bincount, but that only works for certain types
+ # of weight
+ simple_weights = (
+ weights is None or
+ np.can_cast(weights.dtype, np.double) or
+ np.can_cast(weights.dtype, complex)
+ )
+
+ if n_equal_bins is not None and simple_weights:
+ # Fast algorithm for equal bins
# We now convert values of a to bin indices, under the assumption of
# equal bin widths (which is valid here).
# Initialize empty histogram
- n = np.zeros(bins, ntype)
- # Pre-compute histogram scaling factor
- norm = bins / (mx - mn)
+ n = np.zeros(n_equal_bins, ntype)
- # Compute the bin edges for potential correction.
- bin_edges = linspace(mn, mx, bins + 1, endpoint=True)
+ # Pre-compute histogram scaling factor
+ norm = n_equal_bins / (mx - mn)
# We iterate over blocks here for two reasons: the first is that for
# large arrays, it is actually faster (for example for a 10^8 array it
@@ -757,7 +789,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
# Compute the bin indices, and for values that lie exactly on mx we
# need to subtract one
indices = tmp_a.astype(np.intp)
- indices[indices == bins] -= 1
+ indices[indices == n_equal_bins] -= 1
# The index computation is not guaranteed to give exactly
# consistent results within ~1 ULP of the bin edges.
@@ -765,35 +797,26 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
indices[decrement] -= 1
# The last bin includes the right edge. The other bins do not.
increment = ((tmp_a_data >= bin_edges[indices + 1])
- & (indices != bins - 1))
+ & (indices != n_equal_bins - 1))
indices[increment] += 1
# We now compute the histogram using bincount
if ntype.kind == 'c':
n.real += np.bincount(indices, weights=tmp_w.real,
- minlength=bins)
+ minlength=n_equal_bins)
n.imag += np.bincount(indices, weights=tmp_w.imag,
- minlength=bins)
+ minlength=n_equal_bins)
else:
n += np.bincount(indices, weights=tmp_w,
- minlength=bins).astype(ntype)
-
- # Rename the bin edges for return.
- bins = bin_edges
+ minlength=n_equal_bins).astype(ntype)
else:
- bins = asarray(bins)
- if np.any(bins[:-1] > bins[1:]):
- raise ValueError(
- 'bins must increase monotonically.')
-
- # Initialize empty histogram
- n = np.zeros(bins.shape, ntype)
-
+ # Compute via cumulative histogram
+ cum_n = np.zeros(bin_edges.shape, ntype)
if weights is None:
for i in arange(0, len(a), BLOCK):
sa = sort(a[i:i+BLOCK])
- n += np.r_[sa.searchsorted(bins[:-1], 'left'),
- sa.searchsorted(bins[-1], 'right')]
+ cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+ sa.searchsorted(bin_edges[-1], 'right')]
else:
zero = array(0, dtype=ntype)
for i in arange(0, len(a), BLOCK):
@@ -802,27 +825,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
sorting_index = np.argsort(tmp_a)
sa = tmp_a[sorting_index]
sw = tmp_w[sorting_index]
- cw = np.concatenate(([zero, ], sw.cumsum()))
- bin_index = np.r_[sa.searchsorted(bins[:-1], 'left'),
- sa.searchsorted(bins[-1], 'right')]
- n += cw[bin_index]
-
+ cw = np.concatenate(([zero], sw.cumsum()))
+ bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+ sa.searchsorted(bin_edges[-1], 'right')]
+ cum_n += cw[bin_index]
- n = np.diff(n)
+ n = np.diff(cum_n)
- if density is not None:
- if density:
- db = array(np.diff(bins), float)
- return n/db/n.sum(), bins
- else:
- return n, bins
- else:
+ if density:
+ db = array(np.diff(bin_edges), float)
+ return n/db/n.sum(), bin_edges
+ elif normed:
# deprecated, buggy behavior. Remove for NumPy 2.0.0
- if normed:
- db = array(np.diff(bins), float)
- return n/(n*db).sum(), bins
- else:
- return n, bins
+ db = array(np.diff(bin_edges), float)
+ return n/(n*db).sum(), bin_edges
+ else:
+ return n, bin_edges
def histogramdd(sample, bins=10, range=None, normed=False, weights=None):