author     Charles Harris <charlesr.harris@gmail.com>  2017-12-22 10:07:04 -0700
committer  GitHub <noreply@github.com>                 2017-12-22 10:07:04 -0700
commit     0bcc57a4dbfd186b252dbdd73ae543da57fd81e4 (patch)
tree       e1375cf7acbed890358a9a1e619c5d58d7342824
parent     4655d5b96e16a8ac865a4ab61f049c40c30fbbc2 (diff)
parent     ccbe65d126c72444779b269f3e03deb464bc0eee (diff)
Merge pull request #10186 from eric-wieser/move_histogram
MAINT: Move histogram and histogramdd into their own module
-rw-r--r--  doc/release/1.15.0-notes.rst           |   9
-rw-r--r--  numpy/lib/__init__.py                  |   2
-rw-r--r--  numpy/lib/function_base.py             | 804
-rw-r--r--  numpy/lib/histograms.py                | 811
-rw-r--r--  numpy/lib/tests/test_function_base.py  | 512
-rw-r--r--  numpy/lib/tests/test_histograms.py     | 527
6 files changed, 1352 insertions, 1313 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index 8f2de1fda..a01f897ef 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -55,6 +55,15 @@ builtin arbitrary-precision `Decimal` and `long` types.
Improvements
============
+``histogram`` and ``histogramdd`` functions have moved to ``np.lib.histograms``
+--------------------------------------------------------------------------------
+These were originally found in ``np.lib.function_base``. They are still
+available under their un-scoped ``np.histogram(dd)`` names and, to maintain
+compatibility, are aliased at ``np.lib.function_base.histogram(dd)``.
+
+Code that does ``from numpy.lib.function_base import *`` will need to be
+updated with the new location, and should avoid ``import *`` in the future.
+
``MaskedArray.astype`` now is identical to ``ndarray.astype``
-------------------------------------------------------------
This means it takes all the same arguments, making more code written for
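
The compatibility promise in the note above is easy to sanity-check. A
minimal sketch, assuming a NumPy build that includes this change: all three
access paths should resolve to the very same function object.

    import numpy as np
    from numpy.lib import function_base, histograms

    # New canonical home, top-level name, and backwards-compatible alias.
    assert np.histogram is histograms.histogram
    assert function_base.histogram is histograms.histogram
    assert np.histogramdd is histograms.histogramdd
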
diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py
index d85a179dd..cc05232a2 100644
--- a/numpy/lib/__init__.py
+++ b/numpy/lib/__init__.py
@@ -14,6 +14,7 @@ from .shape_base import *
from .stride_tricks import *
from .twodim_base import *
from .ufunclike import *
+from .histograms import *
from . import scimath as emath
from .polynomial import *
@@ -43,6 +44,7 @@ __all__ += arraysetops.__all__
__all__ += npyio.__all__
__all__ += financial.__all__
__all__ += nanfunctions.__all__
+__all__ += histograms.__all__
from numpy.testing import _numpy_tester
test = _numpy_tester().test
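
The two lines added here follow the re-export pattern ``numpy/lib/__init__.py``
already uses for every submodule: star-import the module's public names, then
extend the package ``__all__`` so that ``from numpy.lib import *`` keeps
exposing them. A minimal self-contained sketch of that pattern (the ``mypkg``
names are hypothetical, for illustration only):

    # mypkg/histograms.py
    __all__ = ['histogram', 'histogramdd']

    def histogram(a, bins=10):
        # stub standing in for the real implementation
        raise NotImplementedError

    def histogramdd(sample, bins=10):
        # stub standing in for the real implementation
        raise NotImplementedError

    # mypkg/__init__.py
    from .histograms import *      # pull the public names into the package
    from . import histograms

    __all__ = []
    __all__ += histograms.__all__  # advertise them for `import *` users
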
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 780cf3c5c..e01333b8b 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -39,12 +39,14 @@ if sys.version_info[0] < 3:
else:
import builtins
+# needed in this module for compatibility
+from numpy.lib.histograms import histogram, histogramdd
__all__ = [
'select', 'piecewise', 'trim_zeros', 'copy', 'iterable', 'percentile',
'diff', 'gradient', 'angle', 'unwrap', 'sort_complex', 'disp', 'flip',
'rot90', 'extract', 'place', 'vectorize', 'asarray_chkfinite', 'average',
- 'histogram', 'histogramdd', 'bincount', 'digitize', 'cov', 'corrcoef',
+ 'bincount', 'digitize', 'cov', 'corrcoef',
'msort', 'median', 'sinc', 'hamming', 'hanning', 'bartlett',
'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring',
'meshgrid', 'delete', 'insert', 'append', 'interp', 'add_newdoc_ufunc'
@@ -241,806 +243,6 @@ def iterable(y):
return True
-def _hist_bin_sqrt(x):
- """
- Square root histogram bin estimator.
-
- Bin width is inversely proportional to the data size. Used by many
- programs for its simplicity.
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- return x.ptp() / np.sqrt(x.size)
-
-
-def _hist_bin_sturges(x):
- """
- Sturges histogram bin estimator.
-
- A very simplistic estimator based on the assumption of normality of
- the data. This estimator has poor performance for non-normal data,
- which becomes especially obvious for large data sets. The estimate
- depends only on size of the data.
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- return x.ptp() / (np.log2(x.size) + 1.0)
-
-
-def _hist_bin_rice(x):
- """
- Rice histogram bin estimator.
-
- Another simple estimator with no normality assumption. It has better
- performance for large data than Sturges, but tends to overestimate
- the number of bins. The number of bins is proportional to the cube
- root of data size (asymptotically optimal). The estimate depends
- only on size of the data.
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- return x.ptp() / (2.0 * x.size ** (1.0 / 3))
-
-
-def _hist_bin_scott(x):
- """
- Scott histogram bin estimator.
-
- The binwidth is proportional to the standard deviation of the data
- and inversely proportional to the cube root of data size
- (asymptotically optimal).
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
-
-
-def _hist_bin_doane(x):
- """
- Doane's histogram bin estimator.
-
- Improved version of Sturges' formula which works better for
- non-normal data. See
- stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- if x.size > 2:
- sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
- sigma = np.std(x)
- if sigma > 0.0:
- # These three operations add up to
- # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
- # but use only one temp array instead of three
- temp = x - np.mean(x)
- np.true_divide(temp, sigma, temp)
- np.power(temp, 3, temp)
- g1 = np.mean(temp)
- return x.ptp() / (1.0 + np.log2(x.size) +
- np.log2(1.0 + np.absolute(g1) / sg1))
- return 0.0
-
-
-def _hist_bin_fd(x):
- """
- The Freedman-Diaconis histogram bin estimator.
-
- The Freedman-Diaconis rule uses interquartile range (IQR) to
- estimate binwidth. It is considered a variation of the Scott rule
- with more robustness as the IQR is less affected by outliers than
- the standard deviation. However, the IQR depends on fewer points
- than the standard deviation, so it is less accurate, especially for
- long tailed distributions.
-
- If the IQR is 0, this function returns 1 for the number of bins.
- Binwidth is inversely proportional to the cube root of data size
- (asymptotically optimal).
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- iqr = np.subtract(*np.percentile(x, [75, 25]))
- return 2.0 * iqr * x.size ** (-1.0 / 3.0)
-
-
-def _hist_bin_auto(x):
- """
- Histogram bin estimator that uses the minimum width of the
- Freedman-Diaconis and Sturges estimators.
-
- The FD estimator is usually the most robust method, but its width
- estimate tends to be too large for small `x`. The Sturges estimator
- is quite good for small (<1000) datasets and is the default in the R
- language. This method gives good off the shelf behaviour.
-
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
-
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
-
- See Also
- --------
- _hist_bin_fd, _hist_bin_sturges
- """
- # There is no need to check for zero here. If ptp is, so is IQR and
- # vice versa. Either both are zero or neither one is.
- return min(_hist_bin_fd(x), _hist_bin_sturges(x))
-
-
-# Private dict initialized at module load time
-_hist_bin_selectors = {'auto': _hist_bin_auto,
- 'doane': _hist_bin_doane,
- 'fd': _hist_bin_fd,
- 'rice': _hist_bin_rice,
- 'scott': _hist_bin_scott,
- 'sqrt': _hist_bin_sqrt,
- 'sturges': _hist_bin_sturges}
-
-
-def histogram(a, bins=10, range=None, normed=False, weights=None,
- density=None):
- r"""
- Compute the histogram of a set of data.
-
- Parameters
- ----------
- a : array_like
- Input data. The histogram is computed over the flattened array.
- bins : int or sequence of scalars or str, optional
- If `bins` is an int, it defines the number of equal-width
- bins in the given range (10, by default). If `bins` is a
- sequence, it defines the bin edges, including the rightmost
- edge, allowing for non-uniform bin widths.
-
- .. versionadded:: 1.11.0
-
- If `bins` is a string from the list below, `histogram` will use
- the method chosen to calculate the optimal bin width and
- consequently the number of bins (see `Notes` for more detail on
- the estimators) from the data that falls within the requested
- range. While the bin width will be optimal for the actual data
- in the range, the number of bins will be computed to fill the
- entire range, including the empty portions. For visualisation,
- using the 'auto' option is suggested. Weighted data is not
- supported for automated bin size selection.
-
- 'auto'
- Maximum of the 'sturges' and 'fd' estimators. Provides good
- all around performance.
-
- 'fd' (Freedman Diaconis Estimator)
- Robust (resilient to outliers) estimator that takes into
- account data variability and data size.
-
- 'doane'
- An improved version of Sturges' estimator that works better
- with non-normal datasets.
-
- 'scott'
- Less robust estimator that takes into account data
- variability and data size.
-
- 'rice'
- Estimator does not take variability into account, only data
- size. Commonly overestimates number of bins required.
-
- 'sturges'
- R's default method, only accounts for data size. Only
- optimal for gaussian data and underestimates number of bins
- for large non-gaussian datasets.
-
- 'sqrt'
- Square root (of data size) estimator, used by Excel and
- other programs for its speed and simplicity.
-
- range : (float, float), optional
- The lower and upper range of the bins. If not provided, range
- is simply ``(a.min(), a.max())``. Values outside the range are
- ignored. The first element of the range must be less than or
- equal to the second. `range` affects the automatic bin
- computation as well. While bin width is computed to be optimal
- based on the actual data within `range`, the bin count will fill
- the entire range including portions containing no data.
- normed : bool, optional
- This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy
- behavior. It will be removed in NumPy 2.0.0. Use the ``density``
- keyword instead. If ``False``, the result will contain the
- number of samples in each bin. If ``True``, the result is the
- value of the probability *density* function at the bin,
- normalized such that the *integral* over the range is 1. Note
- that this latter behavior is known to be buggy with unequal bin
- widths; use ``density`` instead.
- weights : array_like, optional
- An array of weights, of the same shape as `a`. Each value in
- `a` only contributes its associated weight towards the bin count
- (instead of 1). If `density` is True, the weights are
- normalized, so that the integral of the density over the range
- remains 1.
- density : bool, optional
- If ``False``, the result will contain the number of samples in
- each bin. If ``True``, the result is the value of the
- probability *density* function at the bin, normalized such that
- the *integral* over the range is 1. Note that the sum of the
- histogram values will not be equal to 1 unless bins of unity
- width are chosen; it is not a probability *mass* function.
-
- Overrides the ``normed`` keyword if given.
-
- Returns
- -------
- hist : array
- The values of the histogram. See `density` and `weights` for a
- description of the possible semantics.
- bin_edges : array of dtype float
- Return the bin edges ``(length(hist)+1)``.
-
-
- See Also
- --------
- histogramdd, bincount, searchsorted, digitize
-
- Notes
- -----
- All but the last (righthand-most) bin is half-open. In other words,
- if `bins` is::
-
- [1, 2, 3, 4]
-
- then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
- the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
- *includes* 4.
-
- .. versionadded:: 1.11.0
-
- The methods to estimate the optimal number of bins are well founded
- in literature, and are inspired by the choices R provides for
- histogram visualisation. Note that having the number of bins
- proportional to :math:`n^{1/3}` is asymptotically optimal, which is
- why it appears in most estimators. These are simply plug-in methods
- that give good starting points for number of bins. In the equations
- below, :math:`h` is the binwidth and :math:`n_h` is the number of
- bins. All estimators that compute bin counts are recast to bin width
- using the `ptp` of the data. The final bin count is obtained from
- ``np.round(np.ceil(range / h))``.
-
- 'Auto' (maximum of the 'Sturges' and 'FD' estimators)
- A compromise to get a good value. For small datasets the Sturges
- value will usually be chosen, while larger datasets will usually
- default to FD. Avoids the overly conservative behaviour of FD
- and Sturges for small and large datasets respectively.
- Switchover point is usually :math:`a.size \approx 1000`.
-
- 'FD' (Freedman Diaconis Estimator)
- .. math:: h = 2 \frac{IQR}{n^{1/3}}
-
- The binwidth is proportional to the interquartile range (IQR)
- and inversely proportional to cube root of a.size. Can be too
- conservative for small datasets, but is quite good for large
- datasets. The IQR is very robust to outliers.
-
- 'Scott'
- .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}}
-
- The binwidth is proportional to the standard deviation of the
- data and inversely proportional to cube root of ``x.size``. Can
- be too conservative for small datasets, but is quite good for
- large datasets. The standard deviation is not very robust to
- outliers. Values are very similar to the Freedman-Diaconis
- estimator in the absence of outliers.
-
- 'Rice'
- .. math:: n_h = 2n^{1/3}
-
- The number of bins is only proportional to cube root of
- ``a.size``. It tends to overestimate the number of bins and it
- does not take into account data variability.
-
- 'Sturges'
- .. math:: n_h = \log _{2}n+1
-
- The number of bins is the base 2 log of ``a.size``. This
- estimator assumes normality of data and is too conservative for
- larger, non-normal datasets. This is the default method in R's
- ``hist`` method.
-
- 'Doane'
- .. math:: n_h = 1 + \log_{2}(n) +
- \log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right)
-
- g_1 = \mathrm{mean}\left[\left(\frac{x - \mu}{\sigma}\right)^3\right]
-
- \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
-
- An improved version of Sturges' formula that produces better
- estimates for non-normal datasets. This estimator attempts to
- account for the skew of the data.
-
- 'Sqrt'
- .. math:: n_h = \sqrt{n}
-
- The simplest and fastest estimator. Only takes into account the
- data size.
-
- Examples
- --------
- >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
- (array([0, 2, 1]), array([0, 1, 2, 3]))
- >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
- (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
- >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
- (array([1, 4, 1]), array([0, 1, 2, 3]))
-
- >>> a = np.arange(5)
- >>> hist, bin_edges = np.histogram(a, density=True)
- >>> hist
- array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
- >>> hist.sum()
- 2.4999999999999996
- >>> np.sum(hist * np.diff(bin_edges))
- 1.0
-
- .. versionadded:: 1.11.0
-
- Automated Bin Selection Methods example, using 2 peak random data
- with 2000 points:
-
- >>> import matplotlib.pyplot as plt
- >>> rng = np.random.RandomState(10) # deterministic random data
- >>> a = np.hstack((rng.normal(size=1000),
- ... rng.normal(loc=5, scale=2, size=1000)))
- >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram
- >>> plt.title("Histogram with 'auto' bins")
- >>> plt.show()
-
- """
- a = asarray(a)
- if weights is not None:
- weights = asarray(weights)
- if weights.shape != a.shape:
- raise ValueError(
- 'weights should have the same shape as a.')
- weights = weights.ravel()
- a = a.ravel()
-
- # Do not modify the original value of range so we can check for `None`
- if range is None:
- if a.size == 0:
- # handle empty arrays. Can't determine range, so use 0-1.
- first_edge, last_edge = 0.0, 1.0
- else:
- first_edge, last_edge = a.min() + 0.0, a.max() + 0.0
- else:
- first_edge, last_edge = [mi + 0.0 for mi in range]
- if first_edge > last_edge:
- raise ValueError(
- 'max must be larger than min in range parameter.')
- if not np.all(np.isfinite([first_edge, last_edge])):
- raise ValueError(
- 'range parameter must be finite.')
- if first_edge == last_edge:
- first_edge -= 0.5
- last_edge += 0.5
-
- # density overrides the normed keyword
- if density is not None:
- normed = False
-
- # parse the overloaded bins argument
- n_equal_bins = None
- bin_edges = None
-
- if isinstance(bins, basestring):
- bin_name = bins
- # if `bins` is a string for an automatic method,
- # this will replace it with the number of bins calculated
- if bin_name not in _hist_bin_selectors:
- raise ValueError(
- "{!r} is not a valid estimator for `bins`".format(bin_name))
- if weights is not None:
- raise TypeError("Automated estimation of the number of "
- "bins is not supported for weighted data")
- # Make a reference to `a`
- b = a
- # Update the reference if the range needs truncation
- if range is not None:
- keep = (a >= first_edge)
- keep &= (a <= last_edge)
- if not np.logical_and.reduce(keep):
- b = a[keep]
-
- if b.size == 0:
- n_equal_bins = 1
- else:
- # Do not call selectors on empty arrays
- width = _hist_bin_selectors[bin_name](b)
- if width:
- n_equal_bins = int(np.ceil((last_edge - first_edge) / width))
- else:
- # Width can be zero for some estimators, e.g. FD when
- # the IQR of the data is zero.
- n_equal_bins = 1
-
- elif np.ndim(bins) == 0:
- try:
- n_equal_bins = operator.index(bins)
- except TypeError:
- raise TypeError(
- '`bins` must be an integer, a string, or an array')
- if n_equal_bins < 1:
- raise ValueError('`bins` must be positive, when an integer')
-
- elif np.ndim(bins) == 1:
- bin_edges = np.asarray(bins)
- if np.any(bin_edges[:-1] > bin_edges[1:]):
- raise ValueError(
- '`bins` must increase monotonically, when an array')
-
- else:
- raise ValueError('`bins` must be 1d, when an array')
-
- del bins
-
- # compute the bins if only the count was specified
- if n_equal_bins is not None:
- bin_edges = linspace(
- first_edge, last_edge, n_equal_bins + 1, endpoint=True)
-
- # Histogram is an integer or a float array depending on the weights.
- if weights is None:
- ntype = np.dtype(np.intp)
- else:
- ntype = weights.dtype
-
- # We set a block size, as this allows us to iterate over chunks when
- # computing histograms, to minimize memory usage.
- BLOCK = 65536
-
- # The fast path uses bincount, but that only works for certain types
- # of weight
- simple_weights = (
- weights is None or
- np.can_cast(weights.dtype, np.double) or
- np.can_cast(weights.dtype, complex)
- )
-
- if n_equal_bins is not None and simple_weights:
- # Fast algorithm for equal bins
- # We now convert values of a to bin indices, under the assumption of
- # equal bin widths (which is valid here).
-
- # Initialize empty histogram
- n = np.zeros(n_equal_bins, ntype)
-
- # Pre-compute histogram scaling factor
- norm = n_equal_bins / (last_edge - first_edge)
-
- # We iterate over blocks here for two reasons: the first is that for
- # large arrays, it is actually faster (for example for a 10^8 array it
- # is 2x as fast) and it results in a memory footprint 3x lower in the
- # limit of large arrays.
- for i in arange(0, len(a), BLOCK):
- tmp_a = a[i:i+BLOCK]
- if weights is None:
- tmp_w = None
- else:
- tmp_w = weights[i:i + BLOCK]
-
- # Only include values in the right range
- keep = (tmp_a >= first_edge)
- keep &= (tmp_a <= last_edge)
- if not np.logical_and.reduce(keep):
- tmp_a = tmp_a[keep]
- if tmp_w is not None:
- tmp_w = tmp_w[keep]
- tmp_a_data = tmp_a.astype(float)
- tmp_a = tmp_a_data - first_edge
- tmp_a *= norm
-
- # Compute the bin indices, and for values that lie exactly on
- # last_edge we need to subtract one
- indices = tmp_a.astype(np.intp)
- indices[indices == n_equal_bins] -= 1
-
- # The index computation is not guaranteed to give exactly
- # consistent results within ~1 ULP of the bin edges.
- decrement = tmp_a_data < bin_edges[indices]
- indices[decrement] -= 1
- # The last bin includes the right edge. The other bins do not.
- increment = ((tmp_a_data >= bin_edges[indices + 1])
- & (indices != n_equal_bins - 1))
- indices[increment] += 1
-
- # We now compute the histogram using bincount
- if ntype.kind == 'c':
- n.real += np.bincount(indices, weights=tmp_w.real,
- minlength=n_equal_bins)
- n.imag += np.bincount(indices, weights=tmp_w.imag,
- minlength=n_equal_bins)
- else:
- n += np.bincount(indices, weights=tmp_w,
- minlength=n_equal_bins).astype(ntype)
- else:
- # Compute via cumulative histogram
- cum_n = np.zeros(bin_edges.shape, ntype)
- if weights is None:
- for i in arange(0, len(a), BLOCK):
- sa = sort(a[i:i+BLOCK])
- cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
- sa.searchsorted(bin_edges[-1], 'right')]
- else:
- zero = array(0, dtype=ntype)
- for i in arange(0, len(a), BLOCK):
- tmp_a = a[i:i+BLOCK]
- tmp_w = weights[i:i+BLOCK]
- sorting_index = np.argsort(tmp_a)
- sa = tmp_a[sorting_index]
- sw = tmp_w[sorting_index]
- cw = np.concatenate(([zero], sw.cumsum()))
- bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
- sa.searchsorted(bin_edges[-1], 'right')]
- cum_n += cw[bin_index]
-
- n = np.diff(cum_n)
-
- if density:
- db = array(np.diff(bin_edges), float)
- return n/db/n.sum(), bin_edges
- elif normed:
- # deprecated, buggy behavior. Remove for NumPy 2.0.0
- db = array(np.diff(bin_edges), float)
- return n/(n*db).sum(), bin_edges
- else:
- return n, bin_edges
-
-
-def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
- """
- Compute the multidimensional histogram of some data.
-
- Parameters
- ----------
- sample : array_like
- The data to be histogrammed. It must be an (N,D) array or data
- that can be converted to such. The rows of the resulting array
- are the coordinates of points in a D dimensional polytope.
- bins : sequence or int, optional
- The bin specification:
-
- * A sequence of arrays describing the bin edges along each dimension.
- * The number of bins for each dimension (nx, ny, ... =bins)
- * The number of bins for all dimensions (nx=ny=...=bins).
-
- range : sequence, optional
- A sequence of lower and upper bin edges to be used if the edges are
- not given explicitly in `bins`. Defaults to the minimum and maximum
- values along each dimension.
- normed : bool, optional
- If False, returns the number of samples in each bin. If True,
- returns the bin density ``bin_count / sample_count / bin_volume``.
- weights : (N,) array_like, optional
- An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
- Weights are normalized to 1 if normed is True. If normed is False,
- the values of the returned histogram are equal to the sum of the
- weights belonging to the samples falling into each bin.
-
- Returns
- -------
- H : ndarray
- The multidimensional histogram of sample x. See normed and weights
- for the different possible semantics.
- edges : list
- A list of D arrays describing the bin edges for each dimension.
-
- See Also
- --------
- histogram: 1-D histogram
- histogram2d: 2-D histogram
-
- Examples
- --------
- >>> r = np.random.randn(100,3)
- >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
- >>> H.shape, edges[0].size, edges[1].size, edges[2].size
- ((5, 8, 4), 6, 9, 5)
-
- """
-
- try:
- # Sample is an ND-array.
- N, D = sample.shape
- except (AttributeError, ValueError):
- # Sample is a sequence of 1D arrays.
- sample = atleast_2d(sample).T
- N, D = sample.shape
-
- nbin = empty(D, int)
- edges = D*[None]
- dedges = D*[None]
- if weights is not None:
- weights = asarray(weights)
-
- try:
- M = len(bins)
- if M != D:
- raise ValueError(
- 'The dimension of bins must be equal to the dimension of the '
- 'sample x.')
- except TypeError:
- # bins is an integer
- bins = D*[bins]
-
- # Select range for each dimension
- # Used only if number of bins is given.
- if range is None:
- # Handle empty input. Range can't be determined in that case, use 0-1.
- if N == 0:
- smin = zeros(D)
- smax = ones(D)
- else:
- smin = atleast_1d(array(sample.min(0), float))
- smax = atleast_1d(array(sample.max(0), float))
- else:
- if not np.all(np.isfinite(range)):
- raise ValueError(
- 'range parameter must be finite.')
- smin = zeros(D)
- smax = zeros(D)
- for i in arange(D):
- smin[i], smax[i] = range[i]
-
- # Make sure the bins have a finite width.
- for i in arange(len(smin)):
- if smin[i] == smax[i]:
- smin[i] = smin[i] - .5
- smax[i] = smax[i] + .5
-
- # avoid rounding issues for comparisons when dealing with inexact types
- if np.issubdtype(sample.dtype, np.inexact):
- edge_dt = sample.dtype
- else:
- edge_dt = float
- # Create edge arrays
- for i in arange(D):
- if isscalar(bins[i]):
- if bins[i] < 1:
- raise ValueError(
- "Element at index %s in `bins` should be a positive "
- "integer." % i)
- nbin[i] = bins[i] + 2 # +2 for outlier bins
- edges[i] = linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt)
- else:
- edges[i] = asarray(bins[i], edge_dt)
- nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
- dedges[i] = diff(edges[i])
- if np.any(np.asarray(dedges[i]) <= 0):
- raise ValueError(
- "Found bin edge of size <= 0. Did you specify `bins` with"
- "non-monotonic sequence?")
-
- nbin = asarray(nbin)
-
- # Handle empty input.
- if N == 0:
- return np.zeros(nbin-2), edges
-
- # Compute the bin number each sample falls into.
- Ncount = {}
- for i in arange(D):
- Ncount[i] = digitize(sample[:, i], edges[i])
-
- # Using digitize, values that fall on an edge are put in the right bin.
- # For the rightmost bin, we want values equal to the right edge to be
- # counted in the last bin, and not as an outlier.
- for i in arange(D):
- # Rounding precision
- mindiff = dedges[i].min()
- if not np.isinf(mindiff):
- decimal = int(-log10(mindiff)) + 6
- # Find which points are on the rightmost edge.
- not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
- on_edge = (around(sample[:, i], decimal) ==
- around(edges[i][-1], decimal))
- # Shift these points one bin to the left.
- Ncount[i][nonzero(on_edge & not_smaller_than_edge)[0]] -= 1
-
- # Flattened histogram matrix (1D)
- # Reshape is used so that overlarge arrays
- # will raise an error.
- hist = zeros(nbin, float).reshape(-1)
-
- # Compute the sample indices in the flattened histogram matrix.
- ni = nbin.argsort()
- xy = zeros(N, int)
- for i in arange(0, D-1):
- xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod()
- xy += Ncount[ni[-1]]
-
- # Compute the number of repetitions in xy and assign it to the
- # flattened histmat.
- if len(xy) == 0:
- return zeros(nbin-2, int), edges
-
- flatcount = bincount(xy, weights)
- a = arange(len(flatcount))
- hist[a] = flatcount
-
- # Shape into a proper matrix
- hist = hist.reshape(sort(nbin))
- for i in arange(nbin.size):
- j = ni.argsort()[i]
- hist = hist.swapaxes(i, j)
- ni[i], ni[j] = ni[j], ni[i]
-
- # Remove outliers (indices 0 and -1 for each dimension).
- core = D*[slice(1, -1)]
- hist = hist[core]
-
- # Normalize if normed is True
- if normed:
- s = hist.sum()
- for i in arange(D):
- shape = ones(D, int)
- shape[i] = nbin[i] - 2
- hist = hist / dedges[i].reshape(shape)
- hist /= s
-
- if (hist.shape != nbin - 2).any():
- raise RuntimeError(
- "Internal Shape Error")
- return hist, edges
-
-
def average(a, axis=None, weights=None, returned=False):
"""
Compute the weighted average along the specified axis.
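
The estimator docstrings removed above translate directly into a few lines of
NumPy, and the Notes section's ``np.round(np.ceil(range / h))`` rule turns a
width into a bin count. A worked sketch of the 'auto' selection (the ``h_*``
names are local to this example, not NumPy API):

    import numpy as np

    x = np.random.RandomState(0).normal(size=1000)

    # Sturges: h = ptp / (log2(n) + 1)
    h_sturges = x.ptp() / (np.log2(x.size) + 1.0)

    # Freedman-Diaconis: h = 2 * IQR / n**(1/3)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    h_fd = 2.0 * iqr * x.size ** (-1.0 / 3.0)

    # 'auto' keeps the smaller width, i.e. the larger bin count
    h = min(h_fd, h_sturges)
    n_bins = int(np.ceil(x.ptp() / h))

    hist, edges = np.histogram(x, bins='auto')
    assert len(hist) == n_bins  # hand computation agrees for this data
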
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
new file mode 100644
index 000000000..61bbab6eb
--- /dev/null
+++ b/numpy/lib/histograms.py
@@ -0,0 +1,811 @@
+"""
+Histogram-related functions
+"""
+from __future__ import division, absolute_import, print_function
+
+import operator
+
+import numpy as np
+from numpy.compat.py3k import basestring
+
+__all__ = ['histogram', 'histogramdd']
+
+
+def _hist_bin_sqrt(x):
+ """
+ Square root histogram bin estimator.
+
+ Bin width is inversely proportional to the data size. Used by many
+ programs for its simplicity.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+ """
+ return x.ptp() / np.sqrt(x.size)
+
+
+def _hist_bin_sturges(x):
+ """
+ Sturges histogram bin estimator.
+
+ A very simplistic estimator based on the assumption of normality of
+ the data. This estimator has poor performance for non-normal data,
+ which becomes especially obvious for large data sets. The estimate
+ depends only on size of the data.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+ """
+ return x.ptp() / (np.log2(x.size) + 1.0)
+
+
+def _hist_bin_rice(x):
+ """
+ Rice histogram bin estimator.
+
+ Another simple estimator with no normality assumption. It has better
+ performance for large data than Sturges, but tends to overestimate
+ the number of bins. The number of bins is proportional to the cube
+ root of data size (asymptotically optimal). The estimate depends
+ only on size of the data.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+ """
+ return x.ptp() / (2.0 * x.size ** (1.0 / 3))
+
+
+def _hist_bin_scott(x):
+ """
+ Scott histogram bin estimator.
+
+ The binwidth is proportional to the standard deviation of the data
+ and inversely proportional to the cube root of data size
+ (asymptotically optimal).
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+ """
+ return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
+
+
+def _hist_bin_doane(x):
+ """
+ Doane's histogram bin estimator.
+
+ Improved version of Sturges' formula which works better for
+ non-normal data. See
+ stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+ """
+ if x.size > 2:
+ sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
+ sigma = np.std(x)
+ if sigma > 0.0:
+ # These three operations add up to
+ # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
+ # but use only one temp array instead of three
+ temp = x - np.mean(x)
+ np.true_divide(temp, sigma, temp)
+ np.power(temp, 3, temp)
+ g1 = np.mean(temp)
+ return x.ptp() / (1.0 + np.log2(x.size) +
+ np.log2(1.0 + np.absolute(g1) / sg1))
+ return 0.0
+
+
+def _hist_bin_fd(x):
+ """
+ The Freedman-Diaconis histogram bin estimator.
+
+ The Freedman-Diaconis rule uses interquartile range (IQR) to
+ estimate binwidth. It is considered a variation of the Scott rule
+ with more robustness as the IQR is less affected by outliers than
+ the standard deviation. However, the IQR depends on fewer points
+ than the standard deviation, so it is less accurate, especially for
+ long tailed distributions.
+
+ If the IQR is 0, this function returns 1 for the number of bins.
+ Binwidth is inversely proportional to the cube root of data size
+ (asymptotically optimal).
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+ """
+ iqr = np.subtract(*np.percentile(x, [75, 25]))
+ return 2.0 * iqr * x.size ** (-1.0 / 3.0)
+
+
+def _hist_bin_auto(x):
+ """
+ Histogram bin estimator that uses the minimum width of the
+ Freedman-Diaconis and Sturges estimators.
+
+ The FD estimator is usually the most robust method, but its width
+ estimate tends to be too large for small `x`. The Sturges estimator
+ is quite good for small (<1000) datasets and is the default in the R
+ language. This method gives good off the shelf behaviour.
+
+ Parameters
+ ----------
+ x : array_like
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
+
+ Returns
+ -------
+ h : An estimate of the optimal bin width for the given data.
+
+ See Also
+ --------
+ _hist_bin_fd, _hist_bin_sturges
+ """
+ # There is no need to check for zero here. If ptp is, so is IQR and
+ # vice versa. Either both are zero or neither one is.
+ return min(_hist_bin_fd(x), _hist_bin_sturges(x))
+
+
+# Private dict initialized at module load time
+_hist_bin_selectors = {'auto': _hist_bin_auto,
+ 'doane': _hist_bin_doane,
+ 'fd': _hist_bin_fd,
+ 'rice': _hist_bin_rice,
+ 'scott': _hist_bin_scott,
+ 'sqrt': _hist_bin_sqrt,
+ 'sturges': _hist_bin_sturges}
+
+
+def histogram(a, bins=10, range=None, normed=False, weights=None,
+ density=None):
+ r"""
+ Compute the histogram of a set of data.
+
+ Parameters
+ ----------
+ a : array_like
+ Input data. The histogram is computed over the flattened array.
+ bins : int or sequence of scalars or str, optional
+ If `bins` is an int, it defines the number of equal-width
+ bins in the given range (10, by default). If `bins` is a
+ sequence, it defines the bin edges, including the rightmost
+ edge, allowing for non-uniform bin widths.
+
+ .. versionadded:: 1.11.0
+
+ If `bins` is a string from the list below, `histogram` will use
+ the method chosen to calculate the optimal bin width and
+ consequently the number of bins (see `Notes` for more detail on
+ the estimators) from the data that falls within the requested
+ range. While the bin width will be optimal for the actual data
+ in the range, the number of bins will be computed to fill the
+ entire range, including the empty portions. For visualisation,
+ using the 'auto' option is suggested. Weighted data is not
+ supported for automated bin size selection.
+
+ 'auto'
+ Maximum of the 'sturges' and 'fd' estimators. Provides good
+ all around performance.
+
+ 'fd' (Freedman Diaconis Estimator)
+ Robust (resilient to outliers) estimator that takes into
+ account data variability and data size.
+
+ 'doane'
+ An improved version of Sturges' estimator that works better
+ with non-normal datasets.
+
+ 'scott'
+ Less robust estimator that takes into account data
+ variability and data size.
+
+ 'rice'
+ Estimator does not take variability into account, only data
+ size. Commonly overestimates number of bins required.
+
+ 'sturges'
+ R's default method, only accounts for data size. Only
+ optimal for gaussian data and underestimates number of bins
+ for large non-gaussian datasets.
+
+ 'sqrt'
+ Square root (of data size) estimator, used by Excel and
+ other programs for its speed and simplicity.
+
+ range : (float, float), optional
+ The lower and upper range of the bins. If not provided, range
+ is simply ``(a.min(), a.max())``. Values outside the range are
+ ignored. The first element of the range must be less than or
+ equal to the second. `range` affects the automatic bin
+ computation as well. While bin width is computed to be optimal
+ based on the actual data within `range`, the bin count will fill
+ the entire range including portions containing no data.
+ normed : bool, optional
+ This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy
+ behavior. It will be removed in NumPy 2.0.0. Use the ``density``
+ keyword instead. If ``False``, the result will contain the
+ number of samples in each bin. If ``True``, the result is the
+ value of the probability *density* function at the bin,
+ normalized such that the *integral* over the range is 1. Note
+ that this latter behavior is known to be buggy with unequal bin
+ widths; use ``density`` instead.
+ weights : array_like, optional
+ An array of weights, of the same shape as `a`. Each value in
+ `a` only contributes its associated weight towards the bin count
+ (instead of 1). If `density` is True, the weights are
+ normalized, so that the integral of the density over the range
+ remains 1.
+ density : bool, optional
+ If ``False``, the result will contain the number of samples in
+ each bin. If ``True``, the result is the value of the
+ probability *density* function at the bin, normalized such that
+ the *integral* over the range is 1. Note that the sum of the
+ histogram values will not be equal to 1 unless bins of unity
+ width are chosen; it is not a probability *mass* function.
+
+ Overrides the ``normed`` keyword if given.
+
+ Returns
+ -------
+ hist : array
+ The values of the histogram. See `density` and `weights` for a
+ description of the possible semantics.
+ bin_edges : array of dtype float
+ Return the bin edges ``(length(hist)+1)``.
+
+
+ See Also
+ --------
+ histogramdd, bincount, searchsorted, digitize
+
+ Notes
+ -----
+ All but the last (righthand-most) bin is half-open. In other words,
+ if `bins` is::
+
+ [1, 2, 3, 4]
+
+ then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
+ the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
+ *includes* 4.
+
+ .. versionadded:: 1.11.0
+
+ The methods to estimate the optimal number of bins are well founded
+ in literature, and are inspired by the choices R provides for
+ histogram visualisation. Note that having the number of bins
+ proportional to :math:`n^{1/3}` is asymptotically optimal, which is
+ why it appears in most estimators. These are simply plug-in methods
+ that give good starting points for number of bins. In the equations
+ below, :math:`h` is the binwidth and :math:`n_h` is the number of
+ bins. All estimators that compute bin counts are recast to bin width
+ using the `ptp` of the data. The final bin count is obtained from
+ ``np.round(np.ceil(range / h))``.
+
+ 'Auto' (maximum of the 'Sturges' and 'FD' estimators)
+ A compromise to get a good value. For small datasets the Sturges
+ value will usually be chosen, while larger datasets will usually
+ default to FD. Avoids the overly conservative behaviour of FD
+ and Sturges for small and large datasets respectively.
+ Switchover point is usually :math:`a.size \approx 1000`.
+
+ 'FD' (Freedman Diaconis Estimator)
+ .. math:: h = 2 \frac{IQR}{n^{1/3}}
+
+ The binwidth is proportional to the interquartile range (IQR)
+ and inversely proportional to cube root of a.size. Can be too
+ conservative for small datasets, but is quite good for large
+ datasets. The IQR is very robust to outliers.
+
+ 'Scott'
+ .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}}
+
+ The binwidth is proportional to the standard deviation of the
+ data and inversely proportional to cube root of ``x.size``. Can
+ be too conservative for small datasets, but is quite good for
+ large datasets. The standard deviation is not very robust to
+ outliers. Values are very similar to the Freedman-Diaconis
+ estimator in the absence of outliers.
+
+ 'Rice'
+ .. math:: n_h = 2n^{1/3}
+
+ The number of bins is only proportional to cube root of
+ ``a.size``. It tends to overestimate the number of bins and it
+ does not take into account data variability.
+
+ 'Sturges'
+ .. math:: n_h = \log _{2}n+1
+
+ The number of bins is the base 2 log of ``a.size``. This
+ estimator assumes normality of data and is too conservative for
+ larger, non-normal datasets. This is the default method in R's
+ ``hist`` method.
+
+ 'Doane'
+ .. math:: n_h = 1 + \log_{2}(n) +
+ \log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right)
+
+ g_1 = \mathrm{mean}\left[\left(\frac{x - \mu}{\sigma}\right)^3\right]
+
+ \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
+
+ An improved version of Sturges' formula that produces better
+ estimates for non-normal datasets. This estimator attempts to
+ account for the skew of the data.
+
+ 'Sqrt'
+ .. math:: n_h = \sqrt{n}
+
+ The simplest and fastest estimator. Only takes into account the
+ data size.
+
+ Examples
+ --------
+ >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
+ (array([0, 2, 1]), array([0, 1, 2, 3]))
+ >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
+ (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
+ >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
+ (array([1, 4, 1]), array([0, 1, 2, 3]))
+
+ >>> a = np.arange(5)
+ >>> hist, bin_edges = np.histogram(a, density=True)
+ >>> hist
+ array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
+ >>> hist.sum()
+ 2.4999999999999996
+ >>> np.sum(hist * np.diff(bin_edges))
+ 1.0
+
+ .. versionadded:: 1.11.0
+
+ Automated Bin Selection Methods example, using 2 peak random data
+ with 2000 points:
+
+ >>> import matplotlib.pyplot as plt
+ >>> rng = np.random.RandomState(10) # deterministic random data
+ >>> a = np.hstack((rng.normal(size=1000),
+ ... rng.normal(loc=5, scale=2, size=1000)))
+ >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram
+ >>> plt.title("Histogram with 'auto' bins")
+ >>> plt.show()
+
+ """
+ a = np.asarray(a)
+ if weights is not None:
+ weights = np.asarray(weights)
+ if weights.shape != a.shape:
+ raise ValueError(
+ 'weights should have the same shape as a.')
+ weights = weights.ravel()
+ a = a.ravel()
+
+ # Do not modify the original value of range so we can check for `None`
+ if range is None:
+ if a.size == 0:
+ # handle empty arrays. Can't determine range, so use 0-1.
+ first_edge, last_edge = 0.0, 1.0
+ else:
+ first_edge, last_edge = a.min() + 0.0, a.max() + 0.0
+ else:
+ first_edge, last_edge = [mi + 0.0 for mi in range]
+ if first_edge > last_edge:
+ raise ValueError(
+ 'max must be larger than min in range parameter.')
+ if not np.all(np.isfinite([first_edge, last_edge])):
+ raise ValueError(
+ 'range parameter must be finite.')
+ if first_edge == last_edge:
+ first_edge -= 0.5
+ last_edge += 0.5
+
+ # density overrides the normed keyword
+ if density is not None:
+ normed = False
+
+ # parse the overloaded bins argument
+ n_equal_bins = None
+ bin_edges = None
+
+ if isinstance(bins, basestring):
+ bin_name = bins
+ # if `bins` is a string for an automatic method,
+ # this will replace it with the number of bins calculated
+ if bin_name not in _hist_bin_selectors:
+ raise ValueError(
+ "{!r} is not a valid estimator for `bins`".format(bin_name))
+ if weights is not None:
+ raise TypeError("Automated estimation of the number of "
+ "bins is not supported for weighted data")
+ # Make a reference to `a`
+ b = a
+ # Update the reference if the range needs truncation
+ if range is not None:
+ keep = (a >= first_edge)
+ keep &= (a <= last_edge)
+ if not np.logical_and.reduce(keep):
+ b = a[keep]
+
+ if b.size == 0:
+ n_equal_bins = 1
+ else:
+ # Do not call selectors on empty arrays
+ width = _hist_bin_selectors[bin_name](b)
+ if width:
+ n_equal_bins = int(np.ceil((last_edge - first_edge) / width))
+ else:
+ # Width can be zero for some estimators, e.g. FD when
+ # the IQR of the data is zero.
+ n_equal_bins = 1
+
+ elif np.ndim(bins) == 0:
+ try:
+ n_equal_bins = operator.index(bins)
+ except TypeError:
+ raise TypeError(
+ '`bins` must be an integer, a string, or an array')
+ if n_equal_bins < 1:
+ raise ValueError('`bins` must be positive, when an integer')
+
+ elif np.ndim(bins) == 1:
+ bin_edges = np.asarray(bins)
+ if np.any(bin_edges[:-1] > bin_edges[1:]):
+ raise ValueError(
+ '`bins` must increase monotonically, when an array')
+
+ else:
+ raise ValueError('`bins` must be 1d, when an array')
+
+ del bins
+
+ # compute the bins if only the count was specified
+ if n_equal_bins is not None:
+ bin_edges = np.linspace(
+ first_edge, last_edge, n_equal_bins + 1, endpoint=True)
+
+ # Histogram is an integer or a float array depending on the weights.
+ if weights is None:
+ ntype = np.dtype(np.intp)
+ else:
+ ntype = weights.dtype
+
+ # We set a block size, as this allows us to iterate over chunks when
+ # computing histograms, to minimize memory usage.
+ BLOCK = 65536
+
+ # The fast path uses bincount, but that only works for certain types
+ # of weight
+ simple_weights = (
+ weights is None or
+ np.can_cast(weights.dtype, np.double) or
+ np.can_cast(weights.dtype, complex)
+ )
+
+ if n_equal_bins is not None and simple_weights:
+ # Fast algorithm for equal bins
+ # We now convert values of a to bin indices, under the assumption of
+ # equal bin widths (which is valid here).
+
+ # Initialize empty histogram
+ n = np.zeros(n_equal_bins, ntype)
+
+ # Pre-compute histogram scaling factor
+ norm = n_equal_bins / (last_edge - first_edge)
+
+ # We iterate over blocks here for two reasons: the first is that for
+ # large arrays, it is actually faster (for example for a 10^8 array it
+ # is 2x as fast) and it results in a memory footprint 3x lower in the
+ # limit of large arrays.
+ for i in np.arange(0, len(a), BLOCK):
+ tmp_a = a[i:i+BLOCK]
+ if weights is None:
+ tmp_w = None
+ else:
+ tmp_w = weights[i:i + BLOCK]
+
+ # Only include values in the right range
+ keep = (tmp_a >= first_edge)
+ keep &= (tmp_a <= last_edge)
+ if not np.logical_and.reduce(keep):
+ tmp_a = tmp_a[keep]
+ if tmp_w is not None:
+ tmp_w = tmp_w[keep]
+ tmp_a_data = tmp_a.astype(float)
+ tmp_a = tmp_a_data - first_edge
+ tmp_a *= norm
+
+ # Compute the bin indices, and for values that lie exactly on
+ # last_edge we need to subtract one
+ indices = tmp_a.astype(np.intp)
+ indices[indices == n_equal_bins] -= 1
+
+ # The index computation is not guaranteed to give exactly
+ # consistent results within ~1 ULP of the bin edges.
+ decrement = tmp_a_data < bin_edges[indices]
+ indices[decrement] -= 1
+ # The last bin includes the right edge. The other bins do not.
+ increment = ((tmp_a_data >= bin_edges[indices + 1])
+ & (indices != n_equal_bins - 1))
+ indices[increment] += 1
+
+ # We now compute the histogram using bincount
+ if ntype.kind == 'c':
+ n.real += np.bincount(indices, weights=tmp_w.real,
+ minlength=n_equal_bins)
+ n.imag += np.bincount(indices, weights=tmp_w.imag,
+ minlength=n_equal_bins)
+ else:
+ n += np.bincount(indices, weights=tmp_w,
+ minlength=n_equal_bins).astype(ntype)
+ else:
+ # Compute via cumulative histogram
+ cum_n = np.zeros(bin_edges.shape, ntype)
+ if weights is None:
+ for i in np.arange(0, len(a), BLOCK):
+ sa = np.sort(a[i:i+BLOCK])
+ cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+ sa.searchsorted(bin_edges[-1], 'right')]
+ else:
+ zero = np.array(0, dtype=ntype)
+ for i in np.arange(0, len(a), BLOCK):
+ tmp_a = a[i:i+BLOCK]
+ tmp_w = weights[i:i+BLOCK]
+ sorting_index = np.argsort(tmp_a)
+ sa = tmp_a[sorting_index]
+ sw = tmp_w[sorting_index]
+ cw = np.concatenate(([zero], sw.cumsum()))
+ bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+ sa.searchsorted(bin_edges[-1], 'right')]
+ cum_n += cw[bin_index]
+
+ n = np.diff(cum_n)
+
+ if density:
+ db = np.array(np.diff(bin_edges), float)
+ return n/db/n.sum(), bin_edges
+ elif normed:
+ # deprecated, buggy behavior. Remove for NumPy 2.0.0
+ db = np.array(np.diff(bin_edges), float)
+ return n/(n*db).sum(), bin_edges
+ else:
+ return n, bin_edges
+
+
+def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
+ """
+ Compute the multidimensional histogram of some data.
+
+ Parameters
+ ----------
+ sample : array_like
+ The data to be histogrammed. It must be an (N,D) array or data
+ that can be converted to such. The rows of the resulting array
+ are the coordinates of points in a D dimensional polytope.
+ bins : sequence or int, optional
+ The bin specification:
+
+ * A sequence of arrays describing the bin edges along each dimension.
+ * The number of bins for each dimension (nx, ny, ... =bins)
+ * The number of bins for all dimensions (nx=ny=...=bins).
+
+ range : sequence, optional
+ A sequence of lower and upper bin edges to be used if the edges are
+ not given explicitly in `bins`. Defaults to the minimum and maximum
+ values along each dimension.
+ normed : bool, optional
+ If False, returns the number of samples in each bin. If True,
+ returns the bin density ``bin_count / sample_count / bin_volume``.
+ weights : (N,) array_like, optional
+ An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
+ Weights are normalized to 1 if normed is True. If normed is False,
+ the values of the returned histogram are equal to the sum of the
+ weights belonging to the samples falling into each bin.
+
+ Returns
+ -------
+ H : ndarray
+ The multidimensional histogram of sample x. See normed and weights
+ for the different possible semantics.
+ edges : list
+ A list of D arrays describing the bin edges for each dimension.
+
+ See Also
+ --------
+ histogram: 1-D histogram
+ histogram2d: 2-D histogram
+
+ Examples
+ --------
+ >>> r = np.random.randn(100,3)
+ >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
+ >>> H.shape, edges[0].size, edges[1].size, edges[2].size
+ ((5, 8, 4), 6, 9, 5)
+
+ """
+
+ try:
+ # Sample is an ND-array.
+ N, D = sample.shape
+ except (AttributeError, ValueError):
+ # Sample is a sequence of 1D arrays.
+ sample = np.atleast_2d(sample).T
+ N, D = sample.shape
+
+ nbin = np.empty(D, int)
+ edges = D*[None]
+ dedges = D*[None]
+ if weights is not None:
+ weights = np.asarray(weights)
+
+ try:
+ M = len(bins)
+ if M != D:
+ raise ValueError(
+ 'The dimension of bins must be equal to the dimension of the '
+ 'sample x.')
+ except TypeError:
+ # bins is an integer
+ bins = D*[bins]
+
+ # Select range for each dimension
+ # Used only if number of bins is given.
+ if range is None:
+ # Handle empty input. Range can't be determined in that case, use 0-1.
+ if N == 0:
+ smin = np.zeros(D)
+ smax = np.ones(D)
+ else:
+ smin = np.atleast_1d(np.array(sample.min(0), float))
+ smax = np.atleast_1d(np.array(sample.max(0), float))
+ else:
+ if not np.all(np.isfinite(range)):
+ raise ValueError(
+ 'range parameter must be finite.')
+ smin = np.zeros(D)
+ smax = np.zeros(D)
+ for i in np.arange(D):
+ smin[i], smax[i] = range[i]
+
+ # Make sure the bins have a finite width.
+ for i in np.arange(len(smin)):
+ if smin[i] == smax[i]:
+ smin[i] = smin[i] - .5
+ smax[i] = smax[i] + .5
+
+ # avoid rounding issues for comparisons when dealing with inexact types
+ if np.issubdtype(sample.dtype, np.inexact):
+ edge_dt = sample.dtype
+ else:
+ edge_dt = float
+ # Create edge arrays
+ for i in np.arange(D):
+ if np.isscalar(bins[i]):
+ if bins[i] < 1:
+ raise ValueError(
+ "Element at index %s in `bins` should be a positive "
+ "integer." % i)
+ nbin[i] = bins[i] + 2 # +2 for outlier bins
+ edges[i] = np.linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt)
+ else:
+ edges[i] = np.asarray(bins[i], edge_dt)
+ nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
+ dedges[i] = np.diff(edges[i])
+ if np.any(np.asarray(dedges[i]) <= 0):
+ raise ValueError(
+ "Found bin edge of size <= 0. Did you specify `bins` with"
+ "non-monotonic sequence?")
+
+ nbin = np.asarray(nbin)
+
+ # Handle empty input.
+ if N == 0:
+ return np.zeros(nbin-2), edges
+
+ # Compute the bin number each sample falls into.
+ Ncount = {}
+ for i in np.arange(D):
+ Ncount[i] = np.digitize(sample[:, i], edges[i])
+
+ # Using digitize, values that fall on an edge are put in the right bin.
+ # For the rightmost bin, we want values equal to the right edge to be
+ # counted in the last bin, and not as an outlier.
+ for i in np.arange(D):
+ # Rounding precision
+ mindiff = dedges[i].min()
+ if not np.isinf(mindiff):
+ decimal = int(-np.log10(mindiff)) + 6
+ # Find which points are on the rightmost edge.
+ not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
+ on_edge = (np.around(sample[:, i], decimal) ==
+ np.around(edges[i][-1], decimal))
+ # Shift these points one bin to the left.
+ Ncount[i][np.nonzero(on_edge & not_smaller_than_edge)[0]] -= 1
+
+ # Flattened histogram matrix (1D)
+ # Reshape is used so that overlarge arrays
+ # will raise an error.
+ hist = np.zeros(nbin, float).reshape(-1)
+
+ # Compute the sample indices in the flattened histogram matrix.
+ ni = nbin.argsort()
+ xy = np.zeros(N, int)
+ for i in np.arange(0, D-1):
+ xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod()
+ xy += Ncount[ni[-1]]
+
+ # Compute the number of repetitions in xy and assign it to the
+ # flattened histmat.
+ if len(xy) == 0:
+ return np.zeros(nbin-2, int), edges
+
+ flatcount = np.bincount(xy, weights)
+ a = np.arange(len(flatcount))
+ hist[a] = flatcount
+
+ # Shape into a proper matrix
+ hist = hist.reshape(np.sort(nbin))
+ for i in np.arange(nbin.size):
+ j = ni.argsort()[i]
+ hist = hist.swapaxes(i, j)
+ ni[i], ni[j] = ni[j], ni[i]
+
+ # Remove outliers (indices 0 and -1 for each dimension).
+ core = D*[slice(1, -1)]
+ hist = hist[core]
+
+ # Normalize if normed is True
+ if normed:
+ s = hist.sum()
+ for i in np.arange(D):
+ shape = np.ones(D, int)
+ shape[i] = nbin[i] - 2
+ hist = hist / dedges[i].reshape(shape)
+ hist /= s
+
+ if (hist.shape != nbin - 2).any():
+ raise RuntimeError(
+ "Internal Shape Error")
+ return hist, edges
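
For equal-width bins the new module takes the fast path described in its
comments: map each value to a bin index and accumulate with ``np.bincount``,
working in blocks to bound memory. A stripped-down illustration of the idea
(unweighted, and without the ULP fixups against ``bin_edges`` that the real
code performs, so an approximation rather than a drop-in replacement):

    import numpy as np

    def blocked_histogram(a, n_bins, first_edge, last_edge, block=65536):
        a = np.asarray(a).ravel()
        counts = np.zeros(n_bins, np.intp)
        norm = n_bins / (last_edge - first_edge)
        for i in range(0, len(a), block):
            chunk = a[i:i + block]
            # Drop out-of-range values, as the real code does.
            chunk = chunk[(chunk >= first_edge) & (chunk <= last_edge)]
            indices = ((chunk - first_edge) * norm).astype(np.intp)
            # Values exactly on the right edge belong in the last bin.
            indices[indices == n_bins] -= 1
            counts += np.bincount(indices, minlength=n_bins)
        return counts

    a = np.random.RandomState(1).uniform(0, 1, 10**5)
    print(blocked_histogram(a, 10, 0.0, 1.0))
    print(np.histogram(a, bins=10, range=(0, 1))[0])  # same up to edge rounding
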
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 8381c2465..dbab69436 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1617,518 +1617,6 @@ class TestSinc(object):
assert_array_equal(y1, y3)
-class TestHistogram(object):
-
- def setup(self):
- pass
-
- def teardown(self):
- pass
-
- def test_simple(self):
- n = 100
- v = rand(n)
- (a, b) = histogram(v)
- # check if the sum of the bins equals the number of samples
- assert_equal(np.sum(a, axis=0), n)
- # check that the bin counts are evenly spaced when the data is from
- # a linear function
- (a, b) = histogram(np.linspace(0, 10, 100))
- assert_array_equal(a, 10)
-
- def test_one_bin(self):
- # Ticket 632
- hist, edges = histogram([1, 2, 3, 4], [1, 2])
- assert_array_equal(hist, [2, ])
- assert_array_equal(edges, [1, 2])
- assert_raises(ValueError, histogram, [1, 2], bins=0)
- h, e = histogram([1, 2], bins=1)
- assert_equal(h, np.array([2]))
- assert_allclose(e, np.array([1., 2.]))
-
- def test_normed(self):
- # Check that the integral of the density equals 1.
- n = 100
- v = rand(n)
- a, b = histogram(v, normed=True)
- area = np.sum(a * diff(b))
- assert_almost_equal(area, 1)
-
- # Check with non-constant bin widths (buggy but backwards
- # compatible)
- v = np.arange(10)
- bins = [0, 1, 5, 9, 10]
- a, b = histogram(v, bins, normed=True)
- area = np.sum(a * diff(b))
- assert_almost_equal(area, 1)
-
- def test_density(self):
- # Check that the integral of the density equals 1.
- n = 100
- v = rand(n)
- a, b = histogram(v, density=True)
- area = np.sum(a * diff(b))
- assert_almost_equal(area, 1)
-
- # Check with non-constant bin widths
- v = np.arange(10)
- bins = [0, 1, 3, 6, 10]
- a, b = histogram(v, bins, density=True)
- assert_array_equal(a, .1)
- assert_equal(np.sum(a * diff(b)), 1)
-
- # Variable bin widths are especially useful to deal with
- # infinities.
- v = np.arange(10)
- bins = [0, 1, 3, 6, np.inf]
- a, b = histogram(v, bins, density=True)
- assert_array_equal(a, [.1, .1, .1, 0.])
-
- # Taken from a bug report from N. Becker on the numpy-discussion
- # mailing list Aug. 6, 2010.
- counts, dmy = np.histogram(
- [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True)
- assert_equal(counts, [.25, 0])
-
- def test_outliers(self):
- # Check that outliers are not tallied
- a = np.arange(10) + .5
-
- # Lower outliers
- h, b = histogram(a, range=[0, 9])
- assert_equal(h.sum(), 9)
-
- # Upper outliers
- h, b = histogram(a, range=[1, 10])
- assert_equal(h.sum(), 9)
-
- # Normalization
- h, b = histogram(a, range=[1, 9], normed=True)
- assert_almost_equal((h * diff(b)).sum(), 1, decimal=15)
-
- # Weights
- w = np.arange(10) + .5
- h, b = histogram(a, range=[1, 9], weights=w, normed=True)
- assert_equal((h * diff(b)).sum(), 1)
-
- h, b = histogram(a, bins=8, range=[1, 9], weights=w)
- assert_equal(h, w[1:-1])
-
- def test_type(self):
- # Check the type of the returned histogram
- a = np.arange(10) + .5
- h, b = histogram(a)
- assert_(np.issubdtype(h.dtype, np.integer))
-
- h, b = histogram(a, normed=True)
- assert_(np.issubdtype(h.dtype, np.floating))
-
- h, b = histogram(a, weights=np.ones(10, int))
- assert_(np.issubdtype(h.dtype, np.integer))
-
- h, b = histogram(a, weights=np.ones(10, float))
- assert_(np.issubdtype(h.dtype, np.floating))
-
- def test_f32_rounding(self):
- # gh-4799, check that the rounding of the edges works with float32
- x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32)
- y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32)
- counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100)
- assert_equal(counts_hist.sum(), 3.)
-
- def test_weights(self):
- v = rand(100)
- w = np.ones(100) * 5
- a, b = histogram(v)
- na, nb = histogram(v, normed=True)
- wa, wb = histogram(v, weights=w)
- nwa, nwb = histogram(v, weights=w, normed=True)
- assert_array_almost_equal(a * 5, wa)
- assert_array_almost_equal(na, nwa)
-
- # Check weights are properly applied.
- v = np.linspace(0, 10, 10)
- w = np.concatenate((np.zeros(5), np.ones(5)))
- wa, wb = histogram(v, bins=np.arange(11), weights=w)
- assert_array_almost_equal(wa, w)
-
- # Check with integer weights
- wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1])
- assert_array_equal(wa, [4, 5, 0, 1])
- wa, wb = histogram(
- [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True)
- assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4)
-
- # Check weights with non-uniform bin widths
- a, b = histogram(
- np.arange(9), [0, 1, 3, 6, 10],
- weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True)
- assert_almost_equal(a, [.2, .1, .1, .075])
-
- def test_exotic_weights(self):
-
- # Test the use of weights that are not integer or floats, but e.g.
- # complex numbers or object types.
-
- # Complex weights
- values = np.array([1.3, 2.5, 2.3])
- weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2])
-
- # Check with custom bins
- wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
- assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
-
- # Check with even bins
- wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
- assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
-
- # Decimal weights
- from decimal import Decimal
- values = np.array([1.3, 2.5, 2.3])
- weights = np.array([Decimal(1), Decimal(2), Decimal(3)])
-
- # Check with custom bins
- wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
- assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
-
- # Check with even bins
- wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
- assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
-
- def test_no_side_effects(self):
- # This is a regression test that ensures that values passed to
- # ``histogram`` are unchanged.
- values = np.array([1.3, 2.5, 2.3])
- np.histogram(values, range=[-10, 10], bins=100)
- assert_array_almost_equal(values, [1.3, 2.5, 2.3])
-
- def test_empty(self):
- a, b = histogram([], bins=([0, 1]))
- assert_array_equal(a, np.array([0]))
- assert_array_equal(b, np.array([0, 1]))
-
-    def test_error_binnum_type(self):
-        # Tests that the right error is raised if the bins argument is a float
- vals = np.linspace(0.0, 1.0, num=100)
- histogram(vals, 5)
- assert_raises(TypeError, histogram, vals, 2.4)
-
- def test_finite_range(self):
- # Normal ranges should be fine
- vals = np.linspace(0.0, 1.0, num=100)
- histogram(vals, range=[0.25,0.75])
- assert_raises(ValueError, histogram, vals, range=[np.nan,0.75])
- assert_raises(ValueError, histogram, vals, range=[0.25,np.inf])
-
- def test_bin_edge_cases(self):
- # Ensure that floating-point computations correctly place edge cases.
- arr = np.array([337, 404, 739, 806, 1007, 1811, 2012])
- hist, edges = np.histogram(arr, bins=8296, range=(2, 2280))
- mask = hist > 0
- left_edges = edges[:-1][mask]
- right_edges = edges[1:][mask]
- for x, left, right in zip(arr, left_edges, right_edges):
- assert_(x >= left)
- assert_(x < right)
-
- def test_last_bin_inclusive_range(self):
- arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
- hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5))
- assert_equal(hist[-1], 1)
-
- def test_unsigned_monotonicity_check(self):
-        # Ensures ValueError is raised if bins are not monotonically
-        # increasing when bins contain unsigned values (see #9222)
- arr = np.array([2])
- bins = np.array([1, 3, 1], dtype='uint64')
- with assert_raises(ValueError):
- hist, edges = np.histogram(arr, bins=bins)
-
-
-class TestHistogramOptimBinNums(object):
- """
-    Provide test coverage when using the provided estimators for the optimal
-    number of bins
- """
-
- def test_empty(self):
- estimator_list = ['fd', 'scott', 'rice', 'sturges',
- 'doane', 'sqrt', 'auto']
- # check it can deal with empty data
- for estimator in estimator_list:
- a, b = histogram([], bins=estimator)
- assert_array_equal(a, np.array([0]))
- assert_array_equal(b, np.array([0, 1]))
-
- def test_simple(self):
- """
- Straightforward testing with a mixture of linspace data (for
- consistency). All test values have been precomputed and the values
- shouldn't change
- """
- # Some basic sanity checking, with some fixed data.
- # Checking for the correct number of bins
- basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
- 'doane': 8, 'sqrt': 8, 'auto': 7},
- 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
- 'doane': 12, 'sqrt': 23, 'auto': 10},
- 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
- 'doane': 17, 'sqrt': 71, 'auto': 17}}
-
- for testlen, expectedResults in basic_test.items():
-            # Create some sort of non-uniform data to test with
- # (2 peak uniform mixture)
- x1 = np.linspace(-10, -1, testlen // 5 * 2)
- x2 = np.linspace(1, 10, testlen // 5 * 3)
- x = np.concatenate((x1, x2))
- for estimator, numbins in expectedResults.items():
- a, b = np.histogram(x, estimator)
- assert_equal(len(a), numbins, err_msg="For the {0} estimator "
- "with datasize of {1}".format(estimator, testlen))
-
- def test_small(self):
- """
- Smaller datasets have the potential to cause issues with the data
- adaptive methods, especially the FD method. All bin numbers have been
- precalculated.
- """
- small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
- 'doane': 1, 'sqrt': 1},
- 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
- 'doane': 1, 'sqrt': 2},
- 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
- 'doane': 3, 'sqrt': 2}}
-
- for testlen, expectedResults in small_dat.items():
- testdat = np.arange(testlen)
- for estimator, expbins in expectedResults.items():
- a, b = np.histogram(testdat, estimator)
- assert_equal(len(a), expbins, err_msg="For the {0} estimator "
- "with datasize of {1}".format(estimator, testlen))
-
- def test_incorrect_methods(self):
- """
-        Check a ValueError is raised when an unknown string is passed in
- """
- check_list = ['mad', 'freeman', 'histograms', 'IQR']
- for estimator in check_list:
- assert_raises(ValueError, histogram, [1, 2, 3], estimator)
-
- def test_novariance(self):
- """
-        Check that methods handle data with no variance.
-        Primarily for Scott and FD, as the SD and IQR are both 0 in this case.
- """
- novar_dataset = np.ones(100)
- novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
- 'doane': 1, 'sqrt': 1, 'auto': 1}
-
- for estimator, numbins in novar_resultdict.items():
- a, b = np.histogram(novar_dataset, estimator)
- assert_equal(len(a), numbins, err_msg="{0} estimator, "
- "No Variance test".format(estimator))
-
- def test_outlier(self):
- """
-        Check the FD, Scott and Doane estimators with outliers.
-
- The FD estimates a smaller binwidth since it's less affected by
- outliers. Since the range is so (artificially) large, this means more
- bins, most of which will be empty, but the data of interest usually is
- unaffected. The Scott estimator is more affected and returns fewer bins,
- despite most of the variance being in one area of the data. The Doane
- estimator lies somewhere between the other two.
- """
- xcenter = np.linspace(-10, 10, 50)
- outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
-
- outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11}
-
- for estimator, numbins in outlier_resultdict.items():
- a, b = np.histogram(outlier_dataset, estimator)
- assert_equal(len(a), numbins)
-
- def test_simple_range(self):
- """
- Straightforward testing with a mixture of linspace data (for
- consistency). Adding in a 3rd mixture that will then be
- completely ignored. All test values have been precomputed and
-        they shouldn't change.
- """
- # some basic sanity checking, with some fixed data.
- # Checking for the correct number of bins
- basic_test = {
- 50: {'fd': 8, 'scott': 8, 'rice': 15,
- 'sturges': 14, 'auto': 14},
- 500: {'fd': 15, 'scott': 16, 'rice': 32,
- 'sturges': 20, 'auto': 20},
- 5000: {'fd': 33, 'scott': 33, 'rice': 69,
- 'sturges': 27, 'auto': 33}
- }
-
- for testlen, expectedResults in basic_test.items():
-            # Create some sort of non-uniform data to test with
- # (3 peak uniform mixture)
- x1 = np.linspace(-10, -1, testlen // 5 * 2)
- x2 = np.linspace(1, 10, testlen // 5 * 3)
- x3 = np.linspace(-100, -50, testlen)
- x = np.hstack((x1, x2, x3))
- for estimator, numbins in expectedResults.items():
-                a, b = np.histogram(x, estimator, range=(-20, 20))
- msg = "For the {0} estimator".format(estimator)
- msg += " with datasize of {0}".format(testlen)
- assert_equal(len(a), numbins, err_msg=msg)
-
- def test_simple_weighted(self):
- """
- Check that weighted data raises a TypeError
- """
- estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
- for estimator in estimator_list:
- assert_raises(TypeError, histogram, [1, 2, 3],
- estimator, weights=[1, 2, 3])
-
-
-class TestHistogramdd(object):
-
- def test_simple(self):
- x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5],
- [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
- H, edges = histogramdd(x, (2, 3, 3),
- range=[[-1, 1], [0, 3], [0, 3]])
- answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]],
- [[0, 1, 0], [0, 0, 1], [0, 0, 1]]])
- assert_array_equal(H, answer)
-
- # Check normalization
- ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
- H, edges = histogramdd(x, bins=ed, normed=True)
- assert_(np.all(H == answer / 12.))
-
- # Check that H has the correct shape.
- H, edges = histogramdd(x, (2, 3, 4),
- range=[[-1, 1], [0, 3], [0, 4]],
- normed=True)
- answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
- [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
- assert_array_almost_equal(H, answer / 6., 4)
- # Check that a sequence of arrays is accepted and H has the correct
- # shape.
- z = [np.squeeze(y) for y in split(x, 3, axis=1)]
- H, edges = histogramdd(
- z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]])
- answer = np.array([[[0, 0], [0, 0], [0, 0]],
- [[0, 1], [0, 0], [1, 0]],
- [[0, 1], [0, 0], [0, 0]],
- [[0, 0], [0, 0], [0, 0]]])
- assert_array_equal(H, answer)
-
- Z = np.zeros((5, 5, 5))
- Z[list(range(5)), list(range(5)), list(range(5))] = 1.
- H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5)
- assert_array_equal(H, Z)
-
- def test_shape_3d(self):
- # All possible permutations for bins of different lengths in 3D.
- bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4),
- (4, 5, 6))
- r = rand(10, 3)
- for b in bins:
- H, edges = histogramdd(r, b)
- assert_(H.shape == b)
-
- def test_shape_4d(self):
- # All possible permutations for bins of different lengths in 4D.
- bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4),
- (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6),
- (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7),
- (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5),
- (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5),
- (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4))
-
- r = rand(10, 4)
- for b in bins:
- H, edges = histogramdd(r, b)
- assert_(H.shape == b)
-
- def test_weights(self):
- v = rand(100, 2)
- hist, edges = histogramdd(v)
- n_hist, edges = histogramdd(v, normed=True)
- w_hist, edges = histogramdd(v, weights=np.ones(100))
- assert_array_equal(w_hist, hist)
- w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True)
- assert_array_equal(w_hist, n_hist)
- w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
- assert_array_equal(w_hist, 2 * hist)
-
- def test_identical_samples(self):
- x = np.zeros((10, 2), int)
- hist, edges = histogramdd(x, bins=2)
- assert_array_equal(edges[0], np.array([-0.5, 0., 0.5]))
-
- def test_empty(self):
- a, b = histogramdd([[], []], bins=([0, 1], [0, 1]))
- assert_array_max_ulp(a, np.array([[0.]]))
- a, b = np.histogramdd([[], [], []], bins=2)
- assert_array_max_ulp(a, np.zeros((2, 2, 2)))
-
- def test_bins_errors(self):
- # There are two ways to specify bins. Check for the right errors
- # when mixing those.
- x = np.arange(8).reshape(2, 4)
- assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
- assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
- assert_raises(
- ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
- assert_raises(
- ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
- assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
-
- def test_inf_edges(self):
- # Test using +/-inf bin edges works. See #1788.
- with np.errstate(invalid='ignore'):
- x = np.arange(6).reshape(3, 2)
- expected = np.array([[1, 0], [0, 1], [0, 1]])
- h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]])
- assert_allclose(h, expected)
- h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])])
- assert_allclose(h, expected)
- h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]])
- assert_allclose(h, expected)
-
- def test_rightmost_binedge(self):
-        # Test an event very close to the rightmost bin edge. See GitHub issue #4266
- x = [0.9999999995]
- bins = [[0., 0.5, 1.0]]
- hist, _ = histogramdd(x, bins=bins)
- assert_(hist[0] == 0.0)
- assert_(hist[1] == 1.)
- x = [1.0]
- bins = [[0., 0.5, 1.0]]
- hist, _ = histogramdd(x, bins=bins)
- assert_(hist[0] == 0.0)
- assert_(hist[1] == 1.)
- x = [1.0000000001]
- bins = [[0., 0.5, 1.0]]
- hist, _ = histogramdd(x, bins=bins)
- assert_(hist[0] == 0.0)
- assert_(hist[1] == 1.)
- x = [1.0001]
- bins = [[0., 0.5, 1.0]]
- hist, _ = histogramdd(x, bins=bins)
- assert_(hist[0] == 0.0)
- assert_(hist[1] == 0.0)
-
- def test_finite_range(self):
- vals = np.random.random((100, 3))
- histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]])
- assert_raises(ValueError, histogramdd, vals,
- range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
- assert_raises(ValueError, histogramdd, vals,
- range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
-
-
class TestUnique(object):
def test_simple(self):
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
new file mode 100644
index 000000000..59baf91fe
--- /dev/null
+++ b/numpy/lib/tests/test_histograms.py
@@ -0,0 +1,527 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+
+from numpy.lib.histograms import histogram, histogramdd
+from numpy.testing import (
+ run_module_suite, assert_, assert_equal, assert_array_equal,
+ assert_almost_equal, assert_array_almost_equal, assert_raises,
+ assert_allclose, assert_array_max_ulp, assert_warns, assert_raises_regex,
+ dec, suppress_warnings, HAS_REFCOUNT,
+)
+
+
+class TestHistogram(object):
+
+ def setup(self):
+ pass
+
+ def teardown(self):
+ pass
+
+ def test_simple(self):
+ n = 100
+ v = np.random.rand(n)
+ (a, b) = histogram(v)
+ # check if the sum of the bins equals the number of samples
+ assert_equal(np.sum(a, axis=0), n)
+ # check that the bin counts are evenly spaced when the data is from
+ # a linear function
+ (a, b) = histogram(np.linspace(0, 10, 100))
+ assert_array_equal(a, 10)
+
+ def test_one_bin(self):
+ # Ticket 632
+ hist, edges = histogram([1, 2, 3, 4], [1, 2])
+ assert_array_equal(hist, [2, ])
+ assert_array_equal(edges, [1, 2])
+ assert_raises(ValueError, histogram, [1, 2], bins=0)
+ h, e = histogram([1, 2], bins=1)
+ assert_equal(h, np.array([2]))
+ assert_allclose(e, np.array([1., 2.]))
+
+ def test_normed(self):
+ # Check that the integral of the density equals 1.
+ n = 100
+ v = np.random.rand(n)
+ a, b = histogram(v, normed=True)
+ area = np.sum(a * np.diff(b))
+ assert_almost_equal(area, 1)
+
+ # Check with non-constant bin widths (buggy but backwards
+ # compatible)
+ v = np.arange(10)
+ bins = [0, 1, 5, 9, 10]
+ a, b = histogram(v, bins, normed=True)
+ area = np.sum(a * np.diff(b))
+ assert_almost_equal(area, 1)
+
+ def test_density(self):
+ # Check that the integral of the density equals 1.
+ n = 100
+ v = np.random.rand(n)
+ a, b = histogram(v, density=True)
+ area = np.sum(a * np.diff(b))
+ assert_almost_equal(area, 1)
+
+ # Check with non-constant bin widths
+ v = np.arange(10)
+ bins = [0, 1, 3, 6, 10]
+ a, b = histogram(v, bins, density=True)
+ assert_array_equal(a, .1)
+ assert_equal(np.sum(a * np.diff(b)), 1)
+
+        # Variable bin widths are especially useful to deal with
+ # infinities.
+ v = np.arange(10)
+ bins = [0, 1, 3, 6, np.inf]
+ a, b = histogram(v, bins, density=True)
+ assert_array_equal(a, [.1, .1, .1, 0.])
+
+ # Taken from a bug report from N. Becker on the numpy-discussion
+ # mailing list Aug. 6, 2010.
+ counts, dmy = np.histogram(
+ [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True)
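+        # With density=True each count is divided by (n_samples * width):
+        # the first bin gives 1 / (4 * 1.0) = 0.25, and the infinite-width
+        # bin gives 3 / (4 * inf) = 0.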
+ assert_equal(counts, [.25, 0])
+
+ def test_outliers(self):
+ # Check that outliers are not tallied
+ a = np.arange(10) + .5
+
+ # Lower outliers
+ h, b = histogram(a, range=[0, 9])
+ assert_equal(h.sum(), 9)
+
+ # Upper outliers
+ h, b = histogram(a, range=[1, 10])
+ assert_equal(h.sum(), 9)
+
+ # Normalization
+ h, b = histogram(a, range=[1, 9], normed=True)
+ assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15)
+
+ # Weights
+ w = np.arange(10) + .5
+ h, b = histogram(a, range=[1, 9], weights=w, normed=True)
+ assert_equal((h * np.diff(b)).sum(), 1)
+
+ h, b = histogram(a, bins=8, range=[1, 9], weights=w)
+ assert_equal(h, w[1:-1])
+
+ def test_type(self):
+ # Check the type of the returned histogram
+ a = np.arange(10) + .5
+ h, b = histogram(a)
+ assert_(np.issubdtype(h.dtype, np.integer))
+
+ h, b = histogram(a, normed=True)
+ assert_(np.issubdtype(h.dtype, np.floating))
+
+ h, b = histogram(a, weights=np.ones(10, int))
+ assert_(np.issubdtype(h.dtype, np.integer))
+
+ h, b = histogram(a, weights=np.ones(10, float))
+ assert_(np.issubdtype(h.dtype, np.floating))
+
+ def test_f32_rounding(self):
+ # gh-4799, check that the rounding of the edges works with float32
+ x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32)
+ y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32)
+ counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100)
+ assert_equal(counts_hist.sum(), 3.)
+
+ def test_weights(self):
+ v = np.random.rand(100)
+ w = np.ones(100) * 5
+ a, b = histogram(v)
+ na, nb = histogram(v, normed=True)
+ wa, wb = histogram(v, weights=w)
+ nwa, nwb = histogram(v, weights=w, normed=True)
+ assert_array_almost_equal(a * 5, wa)
+ assert_array_almost_equal(na, nwa)
+
+ # Check weights are properly applied.
+ v = np.linspace(0, 10, 10)
+ w = np.concatenate((np.zeros(5), np.ones(5)))
+ wa, wb = histogram(v, bins=np.arange(11), weights=w)
+ assert_array_almost_equal(wa, w)
+
+ # Check with integer weights
+ wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1])
+ assert_array_equal(wa, [4, 5, 0, 1])
+ wa, wb = histogram(
+ [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True)
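+        # The range [1, 4] splits into 4 bins of width 3/4, so normed gives
+        # bin_weight / (total_weight * width) = [4, 5, 0, 1] / (10 * 3/4).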
+ assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4)
+
+ # Check weights with non-uniform bin widths
+ a, b = histogram(
+ np.arange(9), [0, 1, 3, 6, 10],
+ weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True)
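+        # Density is bin_weight / (total_weight * bin_width):
+        # [2/(10*1), 2/(10*2), 3/(10*3), 3/(10*4)] == [.2, .1, .1, .075]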
+ assert_almost_equal(a, [.2, .1, .1, .075])
+
+ def test_exotic_weights(self):
+
+ # Test the use of weights that are not integer or floats, but e.g.
+ # complex numbers or object types.
+
+ # Complex weights
+ values = np.array([1.3, 2.5, 2.3])
+ weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2])
+
+ # Check with custom bins
+ wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+ assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+ # Check with even bins
+ wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+ assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+ # Decimal weights
+ from decimal import Decimal
+ values = np.array([1.3, 2.5, 2.3])
+ weights = np.array([Decimal(1), Decimal(2), Decimal(3)])
+
+ # Check with custom bins
+ wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+ assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+ # Check with even bins
+ wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+ assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+ def test_no_side_effects(self):
+ # This is a regression test that ensures that values passed to
+ # ``histogram`` are unchanged.
+ values = np.array([1.3, 2.5, 2.3])
+ np.histogram(values, range=[-10, 10], bins=100)
+ assert_array_almost_equal(values, [1.3, 2.5, 2.3])
+
+ def test_empty(self):
+ a, b = histogram([], bins=([0, 1]))
+ assert_array_equal(a, np.array([0]))
+ assert_array_equal(b, np.array([0, 1]))
+
+    def test_error_binnum_type(self):
+        # Tests that the right error is raised if the bins argument is a float
+ vals = np.linspace(0.0, 1.0, num=100)
+ histogram(vals, 5)
+ assert_raises(TypeError, histogram, vals, 2.4)
+
+ def test_finite_range(self):
+ # Normal ranges should be fine
+ vals = np.linspace(0.0, 1.0, num=100)
+ histogram(vals, range=[0.25,0.75])
+ assert_raises(ValueError, histogram, vals, range=[np.nan,0.75])
+ assert_raises(ValueError, histogram, vals, range=[0.25,np.inf])
+
+ def test_bin_edge_cases(self):
+ # Ensure that floating-point computations correctly place edge cases.
+ arr = np.array([337, 404, 739, 806, 1007, 1811, 2012])
+ hist, edges = np.histogram(arr, bins=8296, range=(2, 2280))
+ mask = hist > 0
+ left_edges = edges[:-1][mask]
+ right_edges = edges[1:][mask]
+ for x, left, right in zip(arr, left_edges, right_edges):
+ assert_(x >= left)
+ assert_(x < right)
+
+ def test_last_bin_inclusive_range(self):
+ arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
+ hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5))
+ assert_equal(hist[-1], 1)
+
+ def test_unsigned_monotonicity_check(self):
+        # Ensures ValueError is raised if bins are not monotonically
+        # increasing when bins contain unsigned values (see #9222)
+ arr = np.array([2])
+ bins = np.array([1, 3, 1], dtype='uint64')
+ with assert_raises(ValueError):
+ hist, edges = np.histogram(arr, bins=bins)
+
+
+class TestHistogramOptimBinNums(object):
+ """
+    Provide test coverage when using the provided estimators for the optimal
+    number of bins
+ """
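+    # Each estimator is selected by passing its name as the bins argument,
+    # e.g. np.histogram(data, bins='auto').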
+
+ def test_empty(self):
+ estimator_list = ['fd', 'scott', 'rice', 'sturges',
+ 'doane', 'sqrt', 'auto']
+ # check it can deal with empty data
+ for estimator in estimator_list:
+ a, b = histogram([], bins=estimator)
+ assert_array_equal(a, np.array([0]))
+ assert_array_equal(b, np.array([0, 1]))
+
+ def test_simple(self):
+ """
+ Straightforward testing with a mixture of linspace data (for
+ consistency). All test values have been precomputed and the values
+ shouldn't change
+ """
+ # Some basic sanity checking, with some fixed data.
+ # Checking for the correct number of bins
+ basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
+ 'doane': 8, 'sqrt': 8, 'auto': 7},
+ 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
+ 'doane': 12, 'sqrt': 23, 'auto': 10},
+ 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
+ 'doane': 17, 'sqrt': 71, 'auto': 17}}
+
+ for testlen, expectedResults in basic_test.items():
+            # Create some sort of non-uniform data to test with
+ # (2 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x = np.concatenate((x1, x2))
+ for estimator, numbins in expectedResults.items():
+ a, b = np.histogram(x, estimator)
+ assert_equal(len(a), numbins, err_msg="For the {0} estimator "
+ "with datasize of {1}".format(estimator, testlen))
+
+ def test_small(self):
+ """
+ Smaller datasets have the potential to cause issues with the data
+ adaptive methods, especially the FD method. All bin numbers have been
+ precalculated.
+ """
+ small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1},
+ 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
+ 'doane': 1, 'sqrt': 2},
+ 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
+ 'doane': 3, 'sqrt': 2}}
+
+ for testlen, expectedResults in small_dat.items():
+ testdat = np.arange(testlen)
+ for estimator, expbins in expectedResults.items():
+ a, b = np.histogram(testdat, estimator)
+ assert_equal(len(a), expbins, err_msg="For the {0} estimator "
+ "with datasize of {1}".format(estimator, testlen))
+
+ def test_incorrect_methods(self):
+ """
+        Check a ValueError is raised when an unknown string is passed in
+ """
+ check_list = ['mad', 'freeman', 'histograms', 'IQR']
+ for estimator in check_list:
+ assert_raises(ValueError, histogram, [1, 2, 3], estimator)
+
+ def test_novariance(self):
+ """
+        Check that methods handle data with no variance.
+        Primarily for Scott and FD, as the SD and IQR are both 0 in this case.
+ """
+ novar_dataset = np.ones(100)
+ novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1, 'auto': 1}
+
+ for estimator, numbins in novar_resultdict.items():
+ a, b = np.histogram(novar_dataset, estimator)
+ assert_equal(len(a), numbins, err_msg="{0} estimator, "
+ "No Variance test".format(estimator))
+
+ def test_outlier(self):
+ """
+        Check the FD, Scott and Doane estimators with outliers.
+
+ The FD estimates a smaller binwidth since it's less affected by
+ outliers. Since the range is so (artificially) large, this means more
+ bins, most of which will be empty, but the data of interest usually is
+ unaffected. The Scott estimator is more affected and returns fewer bins,
+ despite most of the variance being in one area of the data. The Doane
+ estimator lies somewhere between the other two.
+ """
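+        # FD's binwidth scales with the IQR (~ 2*IQR*n**(-1/3)), while
+        # Scott's scales with the standard deviation, which the distant
+        # outliers inflate far more than they do the IQR.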
+ xcenter = np.linspace(-10, 10, 50)
+ outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+ outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11}
+
+ for estimator, numbins in outlier_resultdict.items():
+ a, b = np.histogram(outlier_dataset, estimator)
+ assert_equal(len(a), numbins)
+
+ def test_simple_range(self):
+ """
+ Straightforward testing with a mixture of linspace data (for
+ consistency). Adding in a 3rd mixture that will then be
+ completely ignored. All test values have been precomputed and
+        they shouldn't change.
+ """
+ # some basic sanity checking, with some fixed data.
+ # Checking for the correct number of bins
+ basic_test = {
+ 50: {'fd': 8, 'scott': 8, 'rice': 15,
+ 'sturges': 14, 'auto': 14},
+ 500: {'fd': 15, 'scott': 16, 'rice': 32,
+ 'sturges': 20, 'auto': 20},
+ 5000: {'fd': 33, 'scott': 33, 'rice': 69,
+ 'sturges': 27, 'auto': 33}
+ }
+
+ for testlen, expectedResults in basic_test.items():
+            # Create some sort of non-uniform data to test with
+ # (3 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x3 = np.linspace(-100, -50, testlen)
+ x = np.hstack((x1, x2, x3))
+ for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator, range=(-20, 20))
+ msg = "For the {0} estimator".format(estimator)
+ msg += " with datasize of {0}".format(testlen)
+ assert_equal(len(a), numbins, err_msg=msg)
+
+ def test_simple_weighted(self):
+ """
+ Check that weighted data raises a TypeError
+ """
+ estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+ for estimator in estimator_list:
+ assert_raises(TypeError, histogram, [1, 2, 3],
+ estimator, weights=[1, 2, 3])
+
+
+class TestHistogramdd(object):
+
+ def test_simple(self):
+ x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5],
+ [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
+ H, edges = histogramdd(x, (2, 3, 3),
+ range=[[-1, 1], [0, 3], [0, 3]])
+ answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]],
+ [[0, 1, 0], [0, 0, 1], [0, 0, 1]]])
+ assert_array_equal(H, answer)
+
+ # Check normalization
+ ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
+ H, edges = histogramdd(x, bins=ed, normed=True)
+ assert_(np.all(H == answer / 12.))
+
+ # Check that H has the correct shape.
+ H, edges = histogramdd(x, (2, 3, 4),
+ range=[[-1, 1], [0, 3], [0, 4]],
+ normed=True)
+ answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
+ [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
+ assert_array_almost_equal(H, answer / 6., 4)
+ # Check that a sequence of arrays is accepted and H has the correct
+ # shape.
+ z = [np.squeeze(y) for y in np.split(x, 3, axis=1)]
+ H, edges = histogramdd(
+ z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]])
+ answer = np.array([[[0, 0], [0, 0], [0, 0]],
+ [[0, 1], [0, 0], [1, 0]],
+ [[0, 1], [0, 0], [0, 0]],
+ [[0, 0], [0, 0], [0, 0]]])
+ assert_array_equal(H, answer)
+
+ Z = np.zeros((5, 5, 5))
+ Z[list(range(5)), list(range(5)), list(range(5))] = 1.
+ H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5)
+ assert_array_equal(H, Z)
+
+ def test_shape_3d(self):
+ # All possible permutations for bins of different lengths in 3D.
+ bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4),
+ (4, 5, 6))
+ r = np.random.rand(10, 3)
+ for b in bins:
+ H, edges = histogramdd(r, b)
+ assert_(H.shape == b)
+
+ def test_shape_4d(self):
+ # All possible permutations for bins of different lengths in 4D.
+ bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4),
+ (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6),
+ (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7),
+ (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5),
+ (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5),
+ (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4))
+
+ r = np.random.rand(10, 4)
+ for b in bins:
+ H, edges = histogramdd(r, b)
+ assert_(H.shape == b)
+
+ def test_weights(self):
+ v = np.random.rand(100, 2)
+ hist, edges = histogramdd(v)
+ n_hist, edges = histogramdd(v, normed=True)
+ w_hist, edges = histogramdd(v, weights=np.ones(100))
+ assert_array_equal(w_hist, hist)
+ w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True)
+ assert_array_equal(w_hist, n_hist)
+ w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
+ assert_array_equal(w_hist, 2 * hist)
+
+ def test_identical_samples(self):
+ x = np.zeros((10, 2), int)
+ hist, edges = histogramdd(x, bins=2)
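+        # All samples coincide, so the degenerate range is widened to
+        # [0 - 0.5, 0 + 0.5] before the edges are computed.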
+ assert_array_equal(edges[0], np.array([-0.5, 0., 0.5]))
+
+ def test_empty(self):
+ a, b = histogramdd([[], []], bins=([0, 1], [0, 1]))
+ assert_array_max_ulp(a, np.array([[0.]]))
+ a, b = np.histogramdd([[], [], []], bins=2)
+ assert_array_max_ulp(a, np.zeros((2, 2, 2)))
+
+ def test_bins_errors(self):
+ # There are two ways to specify bins. Check for the right errors
+ # when mixing those.
+ x = np.arange(8).reshape(2, 4)
+ assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
+ assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
+ assert_raises(
+ ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
+ assert_raises(
+ ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
+ assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
+
+ def test_inf_edges(self):
+ # Test using +/-inf bin edges works. See #1788.
+ with np.errstate(invalid='ignore'):
+ x = np.arange(6).reshape(3, 2)
+ expected = np.array([[1, 0], [0, 1], [0, 1]])
+ h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]])
+ assert_allclose(h, expected)
+ h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])])
+ assert_allclose(h, expected)
+ h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]])
+ assert_allclose(h, expected)
+
+ def test_rightmost_binedge(self):
+        # Test an event very close to the rightmost bin edge. See GitHub issue #4266
+ x = [0.9999999995]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0000000001]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0001]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 0.0)
+
+ def test_finite_range(self):
+ vals = np.random.random((100, 3))
+ histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]])
+ assert_raises(ValueError, histogramdd, vals,
+ range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
+ assert_raises(ValueError, histogramdd, vals,
+ range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
+
+
+if __name__ == "__main__":
+ run_module_suite()
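
As ``test_last_bin_inclusive_range`` and ``test_rightmost_binedge`` above
exercise, only the final bin is closed on the right; interior bins are
half-open. A minimal sketch of that behaviour (values chosen to sit exactly
on the edges):

    import numpy as np

    # Interior bins are [left, right); the last bin is [left, right].
    hist, edges = np.histogram([0.5, 1.0], bins=[0.0, 0.5, 1.0])
    assert list(hist) == [0, 2]  # 0.5 starts the last bin; 1.0 sits on its
                                 # closed right edge and is still counted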