author | Charles Harris <charlesr.harris@gmail.com> | 2017-12-22 10:07:04 -0700
committer | GitHub <noreply@github.com> | 2017-12-22 10:07:04 -0700
commit | 0bcc57a4dbfd186b252dbdd73ae543da57fd81e4 (patch)
tree | e1375cf7acbed890358a9a1e619c5d58d7342824
parent | 4655d5b96e16a8ac865a4ab61f049c40c30fbbc2 (diff)
parent | ccbe65d126c72444779b269f3e03deb464bc0eee (diff)
Merge pull request #10186 from eric-wieser/move_histogram
MAINT: Move histogram and histogramdd into their own module
-rw-r--r-- | doc/release/1.15.0-notes.rst | 9
-rw-r--r-- | numpy/lib/__init__.py | 2
-rw-r--r-- | numpy/lib/function_base.py | 804
-rw-r--r-- | numpy/lib/histograms.py | 811
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 512
-rw-r--r-- | numpy/lib/tests/test_histograms.py | 527
6 files changed, 1352 insertions, 1313 deletions
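Before the patch body, here is a minimal sketch of the import change this merge implies; the module paths come from the release note and diff below, while the sample calls are illustrative:

```python
import numpy as np

# New canonical location introduced by this merge:
from numpy.lib.histograms import histogram, histogramdd

# The old location still resolves, via the compatibility aliases the diff
# re-imports into numpy/lib/function_base.py:
from numpy.lib.function_base import histogram, histogramdd

# Most user code is unaffected because the top-level names are unchanged:
counts, edges = np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
print(counts, edges)  # [0 2 1] [0 1 2 3]
```

Only code relying on ``from numpy.lib.function_base import *`` needs updating, since ``histogram`` and ``histogramdd`` are removed from that module's ``__all__``.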
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst index 8f2de1fda..a01f897ef 100644 --- a/doc/release/1.15.0-notes.rst +++ b/doc/release/1.15.0-notes.rst @@ -55,6 +55,15 @@ builtin arbitrary-precision `Decimal` and `long` types. Improvements ============ +``histogram`` and ``histogramdd` functions have moved to ``np.lib.histograms`` +------------------------------------------------------------------------------ +These were originally found in ``np.lib.function_base``. They are still +available under their un-scoped ``np.histogram(dd)`` names, and +to maintain compatibility, aliased at ``np.lib.function_base.histogram(dd)``. + +Code that does ``from np.lib.function_base import *`` will need to be updated +with the new location, and should consider not using ``import *`` in future. + ``MaskedArray.astype`` now is identical to ``ndarray.astype`` ------------------------------------------------------------- This means it takes all the same arguments, making more code written for diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py index d85a179dd..cc05232a2 100644 --- a/numpy/lib/__init__.py +++ b/numpy/lib/__init__.py @@ -14,6 +14,7 @@ from .shape_base import * from .stride_tricks import * from .twodim_base import * from .ufunclike import * +from .histograms import * from . import scimath as emath from .polynomial import * @@ -43,6 +44,7 @@ __all__ += arraysetops.__all__ __all__ += npyio.__all__ __all__ += financial.__all__ __all__ += nanfunctions.__all__ +__all__ += histograms.__all__ from numpy.testing import _numpy_tester test = _numpy_tester().test diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 780cf3c5c..e01333b8b 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -39,12 +39,14 @@ if sys.version_info[0] < 3: else: import builtins +# needed in this module for compatibility +from numpy.lib.histograms import histogram, histogramdd __all__ = [ 'select', 'piecewise', 'trim_zeros', 'copy', 'iterable', 'percentile', 'diff', 'gradient', 'angle', 'unwrap', 'sort_complex', 'disp', 'flip', 'rot90', 'extract', 'place', 'vectorize', 'asarray_chkfinite', 'average', - 'histogram', 'histogramdd', 'bincount', 'digitize', 'cov', 'corrcoef', + 'bincount', 'digitize', 'cov', 'corrcoef', 'msort', 'median', 'sinc', 'hamming', 'hanning', 'bartlett', 'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring', 'meshgrid', 'delete', 'insert', 'append', 'interp', 'add_newdoc_ufunc' @@ -241,806 +243,6 @@ def iterable(y): return True -def _hist_bin_sqrt(x): - """ - Square root histogram bin estimator. - - Bin width is inversely proportional to the data size. Used by many - programs for its simplicity. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / np.sqrt(x.size) - - -def _hist_bin_sturges(x): - """ - Sturges histogram bin estimator. - - A very simplistic estimator based on the assumption of normality of - the data. This estimator has poor performance for non-normal data, - which becomes especially obvious for large data sets. The estimate - depends only on size of the data. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. 
- """ - return x.ptp() / (np.log2(x.size) + 1.0) - - -def _hist_bin_rice(x): - """ - Rice histogram bin estimator. - - Another simple estimator with no normality assumption. It has better - performance for large data than Sturges, but tends to overestimate - the number of bins. The number of bins is proportional to the cube - root of data size (asymptotically optimal). The estimate depends - only on size of the data. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / (2.0 * x.size ** (1.0 / 3)) - - -def _hist_bin_scott(x): - """ - Scott histogram bin estimator. - - The binwidth is proportional to the standard deviation of the data - and inversely proportional to the cube root of data size - (asymptotically optimal). - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) - - -def _hist_bin_doane(x): - """ - Doane's histogram bin estimator. - - Improved version of Sturges' formula which works better for - non-normal data. See - stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - if x.size > 2: - sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) - sigma = np.std(x) - if sigma > 0.0: - # These three operations add up to - # g1 = np.mean(((x - np.mean(x)) / sigma)**3) - # but use only one temp array instead of three - temp = x - np.mean(x) - np.true_divide(temp, sigma, temp) - np.power(temp, 3, temp) - g1 = np.mean(temp) - return x.ptp() / (1.0 + np.log2(x.size) + - np.log2(1.0 + np.absolute(g1) / sg1)) - return 0.0 - - -def _hist_bin_fd(x): - """ - The Freedman-Diaconis histogram bin estimator. - - The Freedman-Diaconis rule uses interquartile range (IQR) to - estimate binwidth. It is considered a variation of the Scott rule - with more robustness as the IQR is less affected by outliers than - the standard deviation. However, the IQR depends on fewer points - than the standard deviation, so it is less accurate, especially for - long tailed distributions. - - If the IQR is 0, this function returns 1 for the number of bins. - Binwidth is inversely proportional to the cube root of data size - (asymptotically optimal). - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - iqr = np.subtract(*np.percentile(x, [75, 25])) - return 2.0 * iqr * x.size ** (-1.0 / 3.0) - - -def _hist_bin_auto(x): - """ - Histogram bin estimator that uses the minimum width of the - Freedman-Diaconis and Sturges estimators. - - The FD estimator is usually the most robust method, but its width - estimate tends to be too large for small `x`. The Sturges estimator - is quite good for small (<1000) datasets and is the default in the R - language. This method gives good off the shelf behaviour. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. 
- - Returns - ------- - h : An estimate of the optimal bin width for the given data. - - See Also - -------- - _hist_bin_fd, _hist_bin_sturges - """ - # There is no need to check for zero here. If ptp is, so is IQR and - # vice versa. Either both are zero or neither one is. - return min(_hist_bin_fd(x), _hist_bin_sturges(x)) - - -# Private dict initialized at module load time -_hist_bin_selectors = {'auto': _hist_bin_auto, - 'doane': _hist_bin_doane, - 'fd': _hist_bin_fd, - 'rice': _hist_bin_rice, - 'scott': _hist_bin_scott, - 'sqrt': _hist_bin_sqrt, - 'sturges': _hist_bin_sturges} - - -def histogram(a, bins=10, range=None, normed=False, weights=None, - density=None): - r""" - Compute the histogram of a set of data. - - Parameters - ---------- - a : array_like - Input data. The histogram is computed over the flattened array. - bins : int or sequence of scalars or str, optional - If `bins` is an int, it defines the number of equal-width - bins in the given range (10, by default). If `bins` is a - sequence, it defines the bin edges, including the rightmost - edge, allowing for non-uniform bin widths. - - .. versionadded:: 1.11.0 - - If `bins` is a string from the list below, `histogram` will use - the method chosen to calculate the optimal bin width and - consequently the number of bins (see `Notes` for more detail on - the estimators) from the data that falls within the requested - range. While the bin width will be optimal for the actual data - in the range, the number of bins will be computed to fill the - entire range, including the empty portions. For visualisation, - using the 'auto' option is suggested. Weighted data is not - supported for automated bin size selection. - - 'auto' - Maximum of the 'sturges' and 'fd' estimators. Provides good - all around performance. - - 'fd' (Freedman Diaconis Estimator) - Robust (resilient to outliers) estimator that takes into - account data variability and data size. - - 'doane' - An improved version of Sturges' estimator that works better - with non-normal datasets. - - 'scott' - Less robust estimator that that takes into account data - variability and data size. - - 'rice' - Estimator does not take variability into account, only data - size. Commonly overestimates number of bins required. - - 'sturges' - R's default method, only accounts for data size. Only - optimal for gaussian data and underestimates number of bins - for large non-gaussian datasets. - - 'sqrt' - Square root (of data size) estimator, used by Excel and - other programs for its speed and simplicity. - - range : (float, float), optional - The lower and upper range of the bins. If not provided, range - is simply ``(a.min(), a.max())``. Values outside the range are - ignored. The first element of the range must be less than or - equal to the second. `range` affects the automatic bin - computation as well. While bin width is computed to be optimal - based on the actual data within `range`, the bin count will fill - the entire range including portions containing no data. - normed : bool, optional - This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy - behavior. It will be removed in NumPy 2.0.0. Use the ``density`` - keyword instead. If ``False``, the result will contain the - number of samples in each bin. If ``True``, the result is the - value of the probability *density* function at the bin, - normalized such that the *integral* over the range is 1. Note - that this latter behavior is known to be buggy with unequal bin - widths; use ``density`` instead. 
- weights : array_like, optional - An array of weights, of the same shape as `a`. Each value in - `a` only contributes its associated weight towards the bin count - (instead of 1). If `density` is True, the weights are - normalized, so that the integral of the density over the range - remains 1. - density : bool, optional - If ``False``, the result will contain the number of samples in - each bin. If ``True``, the result is the value of the - probability *density* function at the bin, normalized such that - the *integral* over the range is 1. Note that the sum of the - histogram values will not be equal to 1 unless bins of unity - width are chosen; it is not a probability *mass* function. - - Overrides the ``normed`` keyword if given. - - Returns - ------- - hist : array - The values of the histogram. See `density` and `weights` for a - description of the possible semantics. - bin_edges : array of dtype float - Return the bin edges ``(length(hist)+1)``. - - - See Also - -------- - histogramdd, bincount, searchsorted, digitize - - Notes - ----- - All but the last (righthand-most) bin is half-open. In other words, - if `bins` is:: - - [1, 2, 3, 4] - - then the first bin is ``[1, 2)`` (including 1, but excluding 2) and - the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which - *includes* 4. - - .. versionadded:: 1.11.0 - - The methods to estimate the optimal number of bins are well founded - in literature, and are inspired by the choices R provides for - histogram visualisation. Note that having the number of bins - proportional to :math:`n^{1/3}` is asymptotically optimal, which is - why it appears in most estimators. These are simply plug-in methods - that give good starting points for number of bins. In the equations - below, :math:`h` is the binwidth and :math:`n_h` is the number of - bins. All estimators that compute bin counts are recast to bin width - using the `ptp` of the data. The final bin count is obtained from - ``np.round(np.ceil(range / h))`. - - 'Auto' (maximum of the 'Sturges' and 'FD' estimators) - A compromise to get a good value. For small datasets the Sturges - value will usually be chosen, while larger datasets will usually - default to FD. Avoids the overly conservative behaviour of FD - and Sturges for small and large datasets respectively. - Switchover point is usually :math:`a.size \approx 1000`. - - 'FD' (Freedman Diaconis Estimator) - .. math:: h = 2 \frac{IQR}{n^{1/3}} - - The binwidth is proportional to the interquartile range (IQR) - and inversely proportional to cube root of a.size. Can be too - conservative for small datasets, but is quite good for large - datasets. The IQR is very robust to outliers. - - 'Scott' - .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} - - The binwidth is proportional to the standard deviation of the - data and inversely proportional to cube root of ``x.size``. Can - be too conservative for small datasets, but is quite good for - large datasets. The standard deviation is not very robust to - outliers. Values are very similar to the Freedman-Diaconis - estimator in the absence of outliers. - - 'Rice' - .. math:: n_h = 2n^{1/3} - - The number of bins is only proportional to cube root of - ``a.size``. It tends to overestimate the number of bins and it - does not take into account data variability. - - 'Sturges' - .. math:: n_h = \log _{2}n+1 - - The number of bins is the base 2 log of ``a.size``. This - estimator assumes normality of data and is too conservative for - larger, non-normal datasets. 
This is the default method in R's - ``hist`` method. - - 'Doane' - .. math:: n_h = 1 + \log_{2}(n) + - \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) - - g_1 = mean[(\frac{x - \mu}{\sigma})^3] - - \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} - - An improved version of Sturges' formula that produces better - estimates for non-normal datasets. This estimator attempts to - account for the skew of the data. - - 'Sqrt' - .. math:: n_h = \sqrt n - The simplest and fastest estimator. Only takes into account the - data size. - - Examples - -------- - >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) - (array([0, 2, 1]), array([0, 1, 2, 3])) - >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) - (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) - >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) - (array([1, 4, 1]), array([0, 1, 2, 3])) - - >>> a = np.arange(5) - >>> hist, bin_edges = np.histogram(a, density=True) - >>> hist - array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) - >>> hist.sum() - 2.4999999999999996 - >>> np.sum(hist * np.diff(bin_edges)) - 1.0 - - .. versionadded:: 1.11.0 - - Automated Bin Selection Methods example, using 2 peak random data - with 2000 points: - - >>> import matplotlib.pyplot as plt - >>> rng = np.random.RandomState(10) # deterministic random data - >>> a = np.hstack((rng.normal(size=1000), - ... rng.normal(loc=5, scale=2, size=1000))) - >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram - >>> plt.title("Histogram with 'auto' bins") - >>> plt.show() - - """ - a = asarray(a) - if weights is not None: - weights = asarray(weights) - if weights.shape != a.shape: - raise ValueError( - 'weights should have the same shape as a.') - weights = weights.ravel() - a = a.ravel() - - # Do not modify the original value of range so we can check for `None` - if range is None: - if a.size == 0: - # handle empty arrays. Can't determine range, so use 0-1. - first_edge, last_edge = 0.0, 1.0 - else: - first_edge, last_edge = a.min() + 0.0, a.max() + 0.0 - else: - first_edge, last_edge = [mi + 0.0 for mi in range] - if first_edge > last_edge: - raise ValueError( - 'max must be larger than min in range parameter.') - if not np.all(np.isfinite([first_edge, last_edge])): - raise ValueError( - 'range parameter must be finite.') - if first_edge == last_edge: - first_edge -= 0.5 - last_edge += 0.5 - - # density overrides the normed keyword - if density is not None: - normed = False - - # parse the overloaded bins argument - n_equal_bins = None - bin_edges = None - - if isinstance(bins, basestring): - bin_name = bins - # if `bins` is a string for an automatic method, - # this will replace it with the number of bins calculated - if bin_name not in _hist_bin_selectors: - raise ValueError( - "{!r} is not a valid estimator for `bins`".format(bin_name)) - if weights is not None: - raise TypeError("Automated estimation of the number of " - "bins is not supported for weighted data") - # Make a reference to `a` - b = a - # Update the reference if the range needs truncation - if range is not None: - keep = (a >= first_edge) - keep &= (a <= last_edge) - if not np.logical_and.reduce(keep): - b = a[keep] - - if b.size == 0: - n_equal_bins = 1 - else: - # Do not call selectors on empty arrays - width = _hist_bin_selectors[bin_name](b) - if width: - n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) - else: - # Width can be zero for some estimators, e.g. FD when - # the IQR of the data is zero. 
- n_equal_bins = 1 - - elif np.ndim(bins) == 0: - try: - n_equal_bins = operator.index(bins) - except TypeError: - raise TypeError( - '`bins` must be an integer, a string, or an array') - if n_equal_bins < 1: - raise ValueError('`bins` must be positive, when an integer') - - elif np.ndim(bins) == 1: - bin_edges = np.asarray(bins) - if np.any(bin_edges[:-1] > bin_edges[1:]): - raise ValueError( - '`bins` must increase monotonically, when an array') - - else: - raise ValueError('`bins` must be 1d, when an array') - - del bins - - # compute the bins if only the count was specified - if n_equal_bins is not None: - bin_edges = linspace( - first_edge, last_edge, n_equal_bins + 1, endpoint=True) - - # Histogram is an integer or a float array depending on the weights. - if weights is None: - ntype = np.dtype(np.intp) - else: - ntype = weights.dtype - - # We set a block size, as this allows us to iterate over chunks when - # computing histograms, to minimize memory usage. - BLOCK = 65536 - - # The fast path uses bincount, but that only works for certain types - # of weight - simple_weights = ( - weights is None or - np.can_cast(weights.dtype, np.double) or - np.can_cast(weights.dtype, complex) - ) - - if n_equal_bins is not None and simple_weights: - # Fast algorithm for equal bins - # We now convert values of a to bin indices, under the assumption of - # equal bin widths (which is valid here). - - # Initialize empty histogram - n = np.zeros(n_equal_bins, ntype) - - # Pre-compute histogram scaling factor - norm = n_equal_bins / (last_edge - first_edge) - - # We iterate over blocks here for two reasons: the first is that for - # large arrays, it is actually faster (for example for a 10^8 array it - # is 2x as fast) and it results in a memory footprint 3x lower in the - # limit of large arrays. - for i in arange(0, len(a), BLOCK): - tmp_a = a[i:i+BLOCK] - if weights is None: - tmp_w = None - else: - tmp_w = weights[i:i + BLOCK] - - # Only include values in the right range - keep = (tmp_a >= first_edge) - keep &= (tmp_a <= last_edge) - if not np.logical_and.reduce(keep): - tmp_a = tmp_a[keep] - if tmp_w is not None: - tmp_w = tmp_w[keep] - tmp_a_data = tmp_a.astype(float) - tmp_a = tmp_a_data - first_edge - tmp_a *= norm - - # Compute the bin indices, and for values that lie exactly on - # last_edge we need to subtract one - indices = tmp_a.astype(np.intp) - indices[indices == n_equal_bins] -= 1 - - # The index computation is not guaranteed to give exactly - # consistent results within ~1 ULP of the bin edges. - decrement = tmp_a_data < bin_edges[indices] - indices[decrement] -= 1 - # The last bin includes the right edge. The other bins do not. 
- increment = ((tmp_a_data >= bin_edges[indices + 1]) - & (indices != n_equal_bins - 1)) - indices[increment] += 1 - - # We now compute the histogram using bincount - if ntype.kind == 'c': - n.real += np.bincount(indices, weights=tmp_w.real, - minlength=n_equal_bins) - n.imag += np.bincount(indices, weights=tmp_w.imag, - minlength=n_equal_bins) - else: - n += np.bincount(indices, weights=tmp_w, - minlength=n_equal_bins).astype(ntype) - else: - # Compute via cumulative histogram - cum_n = np.zeros(bin_edges.shape, ntype) - if weights is None: - for i in arange(0, len(a), BLOCK): - sa = sort(a[i:i+BLOCK]) - cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'), - sa.searchsorted(bin_edges[-1], 'right')] - else: - zero = array(0, dtype=ntype) - for i in arange(0, len(a), BLOCK): - tmp_a = a[i:i+BLOCK] - tmp_w = weights[i:i+BLOCK] - sorting_index = np.argsort(tmp_a) - sa = tmp_a[sorting_index] - sw = tmp_w[sorting_index] - cw = np.concatenate(([zero], sw.cumsum())) - bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'), - sa.searchsorted(bin_edges[-1], 'right')] - cum_n += cw[bin_index] - - n = np.diff(cum_n) - - if density: - db = array(np.diff(bin_edges), float) - return n/db/n.sum(), bin_edges - elif normed: - # deprecated, buggy behavior. Remove for NumPy 2.0.0 - db = array(np.diff(bin_edges), float) - return n/(n*db).sum(), bin_edges - else: - return n, bin_edges - - -def histogramdd(sample, bins=10, range=None, normed=False, weights=None): - """ - Compute the multidimensional histogram of some data. - - Parameters - ---------- - sample : array_like - The data to be histogrammed. It must be an (N,D) array or data - that can be converted to such. The rows of the resulting array - are the coordinates of points in a D dimensional polytope. - bins : sequence or int, optional - The bin specification: - - * A sequence of arrays describing the bin edges along each dimension. - * The number of bins for each dimension (nx, ny, ... =bins) - * The number of bins for all dimensions (nx=ny=...=bins). - - range : sequence, optional - A sequence of lower and upper bin edges to be used if the edges are - not given explicitly in `bins`. Defaults to the minimum and maximum - values along each dimension. - normed : bool, optional - If False, returns the number of samples in each bin. If True, - returns the bin density ``bin_count / sample_count / bin_volume``. - weights : (N,) array_like, optional - An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. - Weights are normalized to 1 if normed is True. If normed is False, - the values of the returned histogram are equal to the sum of the - weights belonging to the samples falling into each bin. - - Returns - ------- - H : ndarray - The multidimensional histogram of sample x. See normed and weights - for the different possible semantics. - edges : list - A list of D arrays describing the bin edges for each dimension. - - See Also - -------- - histogram: 1-D histogram - histogram2d: 2-D histogram - - Examples - -------- - >>> r = np.random.randn(100,3) - >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) - >>> H.shape, edges[0].size, edges[1].size, edges[2].size - ((5, 8, 4), 6, 9, 5) - - """ - - try: - # Sample is an ND-array. - N, D = sample.shape - except (AttributeError, ValueError): - # Sample is a sequence of 1D arrays. 
- sample = atleast_2d(sample).T - N, D = sample.shape - - nbin = empty(D, int) - edges = D*[None] - dedges = D*[None] - if weights is not None: - weights = asarray(weights) - - try: - M = len(bins) - if M != D: - raise ValueError( - 'The dimension of bins must be equal to the dimension of the ' - ' sample x.') - except TypeError: - # bins is an integer - bins = D*[bins] - - # Select range for each dimension - # Used only if number of bins is given. - if range is None: - # Handle empty input. Range can't be determined in that case, use 0-1. - if N == 0: - smin = zeros(D) - smax = ones(D) - else: - smin = atleast_1d(array(sample.min(0), float)) - smax = atleast_1d(array(sample.max(0), float)) - else: - if not np.all(np.isfinite(range)): - raise ValueError( - 'range parameter must be finite.') - smin = zeros(D) - smax = zeros(D) - for i in arange(D): - smin[i], smax[i] = range[i] - - # Make sure the bins have a finite width. - for i in arange(len(smin)): - if smin[i] == smax[i]: - smin[i] = smin[i] - .5 - smax[i] = smax[i] + .5 - - # avoid rounding issues for comparisons when dealing with inexact types - if np.issubdtype(sample.dtype, np.inexact): - edge_dt = sample.dtype - else: - edge_dt = float - # Create edge arrays - for i in arange(D): - if isscalar(bins[i]): - if bins[i] < 1: - raise ValueError( - "Element at index %s in `bins` should be a positive " - "integer." % i) - nbin[i] = bins[i] + 2 # +2 for outlier bins - edges[i] = linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt) - else: - edges[i] = asarray(bins[i], edge_dt) - nbin[i] = len(edges[i]) + 1 # +1 for outlier bins - dedges[i] = diff(edges[i]) - if np.any(np.asarray(dedges[i]) <= 0): - raise ValueError( - "Found bin edge of size <= 0. Did you specify `bins` with" - "non-monotonic sequence?") - - nbin = asarray(nbin) - - # Handle empty input. - if N == 0: - return np.zeros(nbin-2), edges - - # Compute the bin number each sample falls into. - Ncount = {} - for i in arange(D): - Ncount[i] = digitize(sample[:, i], edges[i]) - - # Using digitize, values that fall on an edge are put in the right bin. - # For the rightmost bin, we want values equal to the right edge to be - # counted in the last bin, and not as an outlier. - for i in arange(D): - # Rounding precision - mindiff = dedges[i].min() - if not np.isinf(mindiff): - decimal = int(-log10(mindiff)) + 6 - # Find which points are on the rightmost edge. - not_smaller_than_edge = (sample[:, i] >= edges[i][-1]) - on_edge = (around(sample[:, i], decimal) == - around(edges[i][-1], decimal)) - # Shift these points one bin to the left. - Ncount[i][nonzero(on_edge & not_smaller_than_edge)[0]] -= 1 - - # Flattened histogram matrix (1D) - # Reshape is used so that overlarge arrays - # will raise an error. - hist = zeros(nbin, float).reshape(-1) - - # Compute the sample indices in the flattened histogram matrix. - ni = nbin.argsort() - xy = zeros(N, int) - for i in arange(0, D-1): - xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() - xy += Ncount[ni[-1]] - - # Compute the number of repetitions in xy and assign it to the - # flattened histmat. - if len(xy) == 0: - return zeros(nbin-2, int), edges - - flatcount = bincount(xy, weights) - a = arange(len(flatcount)) - hist[a] = flatcount - - # Shape into a proper matrix - hist = hist.reshape(sort(nbin)) - for i in arange(nbin.size): - j = ni.argsort()[i] - hist = hist.swapaxes(i, j) - ni[i], ni[j] = ni[j], ni[i] - - # Remove outliers (indices 0 and -1 for each dimension). 
- core = D*[slice(1, -1)] - hist = hist[core] - - # Normalize if normed is True - if normed: - s = hist.sum() - for i in arange(D): - shape = ones(D, int) - shape[i] = nbin[i] - 2 - hist = hist / dedges[i].reshape(shape) - hist /= s - - if (hist.shape != nbin - 2).any(): - raise RuntimeError( - "Internal Shape Error") - return hist, edges - - def average(a, axis=None, weights=None, returned=False): """ Compute the weighted average along the specified axis. diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py new file mode 100644 index 000000000..61bbab6eb --- /dev/null +++ b/numpy/lib/histograms.py @@ -0,0 +1,811 @@ +""" +Histogram-related functions +""" +from __future__ import division, absolute_import, print_function + +import operator + +import numpy as np +from numpy.compat.py3k import basestring + +__all__ = ['histogram', 'histogramdd'] + + +def _hist_bin_sqrt(x): + """ + Square root histogram bin estimator. + + Bin width is inversely proportional to the data size. Used by many + programs for its simplicity. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / np.sqrt(x.size) + + +def _hist_bin_sturges(x): + """ + Sturges histogram bin estimator. + + A very simplistic estimator based on the assumption of normality of + the data. This estimator has poor performance for non-normal data, + which becomes especially obvious for large data sets. The estimate + depends only on size of the data. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / (np.log2(x.size) + 1.0) + + +def _hist_bin_rice(x): + """ + Rice histogram bin estimator. + + Another simple estimator with no normality assumption. It has better + performance for large data than Sturges, but tends to overestimate + the number of bins. The number of bins is proportional to the cube + root of data size (asymptotically optimal). The estimate depends + only on size of the data. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / (2.0 * x.size ** (1.0 / 3)) + + +def _hist_bin_scott(x): + """ + Scott histogram bin estimator. + + The binwidth is proportional to the standard deviation of the data + and inversely proportional to the cube root of data size + (asymptotically optimal). + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) + + +def _hist_bin_doane(x): + """ + Doane's histogram bin estimator. + + Improved version of Sturges' formula which works better for + non-normal data. See + stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. 
+ """ + if x.size > 2: + sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) + sigma = np.std(x) + if sigma > 0.0: + # These three operations add up to + # g1 = np.mean(((x - np.mean(x)) / sigma)**3) + # but use only one temp array instead of three + temp = x - np.mean(x) + np.true_divide(temp, sigma, temp) + np.power(temp, 3, temp) + g1 = np.mean(temp) + return x.ptp() / (1.0 + np.log2(x.size) + + np.log2(1.0 + np.absolute(g1) / sg1)) + return 0.0 + + +def _hist_bin_fd(x): + """ + The Freedman-Diaconis histogram bin estimator. + + The Freedman-Diaconis rule uses interquartile range (IQR) to + estimate binwidth. It is considered a variation of the Scott rule + with more robustness as the IQR is less affected by outliers than + the standard deviation. However, the IQR depends on fewer points + than the standard deviation, so it is less accurate, especially for + long tailed distributions. + + If the IQR is 0, this function returns 1 for the number of bins. + Binwidth is inversely proportional to the cube root of data size + (asymptotically optimal). + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + iqr = np.subtract(*np.percentile(x, [75, 25])) + return 2.0 * iqr * x.size ** (-1.0 / 3.0) + + +def _hist_bin_auto(x): + """ + Histogram bin estimator that uses the minimum width of the + Freedman-Diaconis and Sturges estimators. + + The FD estimator is usually the most robust method, but its width + estimate tends to be too large for small `x`. The Sturges estimator + is quite good for small (<1000) datasets and is the default in the R + language. This method gives good off the shelf behaviour. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + + See Also + -------- + _hist_bin_fd, _hist_bin_sturges + """ + # There is no need to check for zero here. If ptp is, so is IQR and + # vice versa. Either both are zero or neither one is. + return min(_hist_bin_fd(x), _hist_bin_sturges(x)) + + +# Private dict initialized at module load time +_hist_bin_selectors = {'auto': _hist_bin_auto, + 'doane': _hist_bin_doane, + 'fd': _hist_bin_fd, + 'rice': _hist_bin_rice, + 'scott': _hist_bin_scott, + 'sqrt': _hist_bin_sqrt, + 'sturges': _hist_bin_sturges} + + +def histogram(a, bins=10, range=None, normed=False, weights=None, + density=None): + r""" + Compute the histogram of a set of data. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened array. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines the bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + + .. versionadded:: 1.11.0 + + If `bins` is a string from the list below, `histogram` will use + the method chosen to calculate the optimal bin width and + consequently the number of bins (see `Notes` for more detail on + the estimators) from the data that falls within the requested + range. While the bin width will be optimal for the actual data + in the range, the number of bins will be computed to fill the + entire range, including the empty portions. 
For visualisation, + using the 'auto' option is suggested. Weighted data is not + supported for automated bin size selection. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good + all around performance. + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into + account data variability and data size. + + 'doane' + An improved version of Sturges' estimator that works better + with non-normal datasets. + + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'rice' + Estimator does not take variability into account, only data + size. Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only + optimal for gaussian data and underestimates number of bins + for large non-gaussian datasets. + + 'sqrt' + Square root (of data size) estimator, used by Excel and + other programs for its speed and simplicity. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + normed : bool, optional + This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy + behavior. It will be removed in NumPy 2.0.0. Use the ``density`` + keyword instead. If ``False``, the result will contain the + number of samples in each bin. If ``True``, the result is the + value of the probability *density* function at the bin, + normalized such that the *integral* over the range is 1. Note + that this latter behavior is known to be buggy with unequal bin + widths; use ``density`` instead. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unity + width are chosen; it is not a probability *mass* function. + + Overrides the ``normed`` keyword if given. + + Returns + ------- + hist : array + The values of the histogram. See `density` and `weights` for a + description of the possible semantics. + bin_edges : array of dtype float + Return the bin edges ``(length(hist)+1)``. + + + See Also + -------- + histogramdd, bincount, searchsorted, digitize + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, + if `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and + the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which + *includes* 4. + + .. versionadded:: 1.11.0 + + The methods to estimate the optimal number of bins are well founded + in literature, and are inspired by the choices R provides for + histogram visualisation. 
Note that having the number of bins + proportional to :math:`n^{1/3}` is asymptotically optimal, which is + why it appears in most estimators. These are simply plug-in methods + that give good starting points for number of bins. In the equations + below, :math:`h` is the binwidth and :math:`n_h` is the number of + bins. All estimators that compute bin counts are recast to bin width + using the `ptp` of the data. The final bin count is obtained from + ``np.round(np.ceil(range / h))`. + + 'Auto' (maximum of the 'Sturges' and 'FD' estimators) + A compromise to get a good value. For small datasets the Sturges + value will usually be chosen, while larger datasets will usually + default to FD. Avoids the overly conservative behaviour of FD + and Sturges for small and large datasets respectively. + Switchover point is usually :math:`a.size \approx 1000`. + + 'FD' (Freedman Diaconis Estimator) + .. math:: h = 2 \frac{IQR}{n^{1/3}} + + The binwidth is proportional to the interquartile range (IQR) + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good for large + datasets. The IQR is very robust to outliers. + + 'Scott' + .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} + + The binwidth is proportional to the standard deviation of the + data and inversely proportional to cube root of ``x.size``. Can + be too conservative for small datasets, but is quite good for + large datasets. The standard deviation is not very robust to + outliers. Values are very similar to the Freedman-Diaconis + estimator in the absence of outliers. + + 'Rice' + .. math:: n_h = 2n^{1/3} + + The number of bins is only proportional to cube root of + ``a.size``. It tends to overestimate the number of bins and it + does not take into account data variability. + + 'Sturges' + .. math:: n_h = \log _{2}n+1 + + The number of bins is the base 2 log of ``a.size``. This + estimator assumes normality of data and is too conservative for + larger, non-normal datasets. This is the default method in R's + ``hist`` method. + + 'Doane' + .. math:: n_h = 1 + \log_{2}(n) + + \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) + + g_1 = mean[(\frac{x - \mu}{\sigma})^3] + + \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} + + An improved version of Sturges' formula that produces better + estimates for non-normal datasets. This estimator attempts to + account for the skew of the data. + + 'Sqrt' + .. math:: n_h = \sqrt n + The simplest and fastest estimator. Only takes into account the + data size. + + Examples + -------- + >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) + (array([0, 2, 1]), array([0, 1, 2, 3])) + >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) + (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) + >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) + (array([1, 4, 1]), array([0, 1, 2, 3])) + + >>> a = np.arange(5) + >>> hist, bin_edges = np.histogram(a, density=True) + >>> hist + array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) + >>> hist.sum() + 2.4999999999999996 + >>> np.sum(hist * np.diff(bin_edges)) + 1.0 + + .. versionadded:: 1.11.0 + + Automated Bin Selection Methods example, using 2 peak random data + with 2000 points: + + >>> import matplotlib.pyplot as plt + >>> rng = np.random.RandomState(10) # deterministic random data + >>> a = np.hstack((rng.normal(size=1000), + ... 
rng.normal(loc=5, scale=2, size=1000))) + >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram + >>> plt.title("Histogram with 'auto' bins") + >>> plt.show() + + """ + a = np.asarray(a) + if weights is not None: + weights = np.asarray(weights) + if weights.shape != a.shape: + raise ValueError( + 'weights should have the same shape as a.') + weights = weights.ravel() + a = a.ravel() + + # Do not modify the original value of range so we can check for `None` + if range is None: + if a.size == 0: + # handle empty arrays. Can't determine range, so use 0-1. + first_edge, last_edge = 0.0, 1.0 + else: + first_edge, last_edge = a.min() + 0.0, a.max() + 0.0 + else: + first_edge, last_edge = [mi + 0.0 for mi in range] + if first_edge > last_edge: + raise ValueError( + 'max must be larger than min in range parameter.') + if not np.all(np.isfinite([first_edge, last_edge])): + raise ValueError( + 'range parameter must be finite.') + if first_edge == last_edge: + first_edge -= 0.5 + last_edge += 0.5 + + # density overrides the normed keyword + if density is not None: + normed = False + + # parse the overloaded bins argument + n_equal_bins = None + bin_edges = None + + if isinstance(bins, basestring): + bin_name = bins + # if `bins` is a string for an automatic method, + # this will replace it with the number of bins calculated + if bin_name not in _hist_bin_selectors: + raise ValueError( + "{!r} is not a valid estimator for `bins`".format(bin_name)) + if weights is not None: + raise TypeError("Automated estimation of the number of " + "bins is not supported for weighted data") + # Make a reference to `a` + b = a + # Update the reference if the range needs truncation + if range is not None: + keep = (a >= first_edge) + keep &= (a <= last_edge) + if not np.logical_and.reduce(keep): + b = a[keep] + + if b.size == 0: + n_equal_bins = 1 + else: + # Do not call selectors on empty arrays + width = _hist_bin_selectors[bin_name](b) + if width: + n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) + else: + # Width can be zero for some estimators, e.g. FD when + # the IQR of the data is zero. + n_equal_bins = 1 + + elif np.ndim(bins) == 0: + try: + n_equal_bins = operator.index(bins) + except TypeError: + raise TypeError( + '`bins` must be an integer, a string, or an array') + if n_equal_bins < 1: + raise ValueError('`bins` must be positive, when an integer') + + elif np.ndim(bins) == 1: + bin_edges = np.asarray(bins) + if np.any(bin_edges[:-1] > bin_edges[1:]): + raise ValueError( + '`bins` must increase monotonically, when an array') + + else: + raise ValueError('`bins` must be 1d, when an array') + + del bins + + # compute the bins if only the count was specified + if n_equal_bins is not None: + bin_edges = np.linspace( + first_edge, last_edge, n_equal_bins + 1, endpoint=True) + + # Histogram is an integer or a float array depending on the weights. + if weights is None: + ntype = np.dtype(np.intp) + else: + ntype = weights.dtype + + # We set a block size, as this allows us to iterate over chunks when + # computing histograms, to minimize memory usage. + BLOCK = 65536 + + # The fast path uses bincount, but that only works for certain types + # of weight + simple_weights = ( + weights is None or + np.can_cast(weights.dtype, np.double) or + np.can_cast(weights.dtype, complex) + ) + + if n_equal_bins is not None and simple_weights: + # Fast algorithm for equal bins + # We now convert values of a to bin indices, under the assumption of + # equal bin widths (which is valid here). 
+ + # Initialize empty histogram + n = np.zeros(n_equal_bins, ntype) + + # Pre-compute histogram scaling factor + norm = n_equal_bins / (last_edge - first_edge) + + # We iterate over blocks here for two reasons: the first is that for + # large arrays, it is actually faster (for example for a 10^8 array it + # is 2x as fast) and it results in a memory footprint 3x lower in the + # limit of large arrays. + for i in np.arange(0, len(a), BLOCK): + tmp_a = a[i:i+BLOCK] + if weights is None: + tmp_w = None + else: + tmp_w = weights[i:i + BLOCK] + + # Only include values in the right range + keep = (tmp_a >= first_edge) + keep &= (tmp_a <= last_edge) + if not np.logical_and.reduce(keep): + tmp_a = tmp_a[keep] + if tmp_w is not None: + tmp_w = tmp_w[keep] + tmp_a_data = tmp_a.astype(float) + tmp_a = tmp_a_data - first_edge + tmp_a *= norm + + # Compute the bin indices, and for values that lie exactly on + # last_edge we need to subtract one + indices = tmp_a.astype(np.intp) + indices[indices == n_equal_bins] -= 1 + + # The index computation is not guaranteed to give exactly + # consistent results within ~1 ULP of the bin edges. + decrement = tmp_a_data < bin_edges[indices] + indices[decrement] -= 1 + # The last bin includes the right edge. The other bins do not. + increment = ((tmp_a_data >= bin_edges[indices + 1]) + & (indices != n_equal_bins - 1)) + indices[increment] += 1 + + # We now compute the histogram using bincount + if ntype.kind == 'c': + n.real += np.bincount(indices, weights=tmp_w.real, + minlength=n_equal_bins) + n.imag += np.bincount(indices, weights=tmp_w.imag, + minlength=n_equal_bins) + else: + n += np.bincount(indices, weights=tmp_w, + minlength=n_equal_bins).astype(ntype) + else: + # Compute via cumulative histogram + cum_n = np.zeros(bin_edges.shape, ntype) + if weights is None: + for i in np.arange(0, len(a), BLOCK): + sa = np.sort(a[i:i+BLOCK]) + cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'), + sa.searchsorted(bin_edges[-1], 'right')] + else: + zero = np.array(0, dtype=ntype) + for i in np.arange(0, len(a), BLOCK): + tmp_a = a[i:i+BLOCK] + tmp_w = weights[i:i+BLOCK] + sorting_index = np.argsort(tmp_a) + sa = tmp_a[sorting_index] + sw = tmp_w[sorting_index] + cw = np.concatenate(([zero], sw.cumsum())) + bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'), + sa.searchsorted(bin_edges[-1], 'right')] + cum_n += cw[bin_index] + + n = np.diff(cum_n) + + if density: + db = np.array(np.diff(bin_edges), float) + return n/db/n.sum(), bin_edges + elif normed: + # deprecated, buggy behavior. Remove for NumPy 2.0.0 + db = np.array(np.diff(bin_edges), float) + return n/(n*db).sum(), bin_edges + else: + return n, bin_edges + + +def histogramdd(sample, bins=10, range=None, normed=False, weights=None): + """ + Compute the multidimensional histogram of some data. + + Parameters + ---------- + sample : array_like + The data to be histogrammed. It must be an (N,D) array or data + that can be converted to such. The rows of the resulting array + are the coordinates of points in a D dimensional polytope. + bins : sequence or int, optional + The bin specification: + + * A sequence of arrays describing the bin edges along each dimension. + * The number of bins for each dimension (nx, ny, ... =bins) + * The number of bins for all dimensions (nx=ny=...=bins). + + range : sequence, optional + A sequence of lower and upper bin edges to be used if the edges are + not given explicitly in `bins`. Defaults to the minimum and maximum + values along each dimension. 
+ normed : bool, optional + If False, returns the number of samples in each bin. If True, + returns the bin density ``bin_count / sample_count / bin_volume``. + weights : (N,) array_like, optional + An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. + Weights are normalized to 1 if normed is True. If normed is False, + the values of the returned histogram are equal to the sum of the + weights belonging to the samples falling into each bin. + + Returns + ------- + H : ndarray + The multidimensional histogram of sample x. See normed and weights + for the different possible semantics. + edges : list + A list of D arrays describing the bin edges for each dimension. + + See Also + -------- + histogram: 1-D histogram + histogram2d: 2-D histogram + + Examples + -------- + >>> r = np.random.randn(100,3) + >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) + >>> H.shape, edges[0].size, edges[1].size, edges[2].size + ((5, 8, 4), 6, 9, 5) + + """ + + try: + # Sample is an ND-array. + N, D = sample.shape + except (AttributeError, ValueError): + # Sample is a sequence of 1D arrays. + sample = np.atleast_2d(sample).T + N, D = sample.shape + + nbin = np.empty(D, int) + edges = D*[None] + dedges = D*[None] + if weights is not None: + weights = np.asarray(weights) + + try: + M = len(bins) + if M != D: + raise ValueError( + 'The dimension of bins must be equal to the dimension of the ' + ' sample x.') + except TypeError: + # bins is an integer + bins = D*[bins] + + # Select range for each dimension + # Used only if number of bins is given. + if range is None: + # Handle empty input. Range can't be determined in that case, use 0-1. + if N == 0: + smin = np.zeros(D) + smax = np.ones(D) + else: + smin = np.atleast_1d(np.array(sample.min(0), float)) + smax = np.atleast_1d(np.array(sample.max(0), float)) + else: + if not np.all(np.isfinite(range)): + raise ValueError( + 'range parameter must be finite.') + smin = np.zeros(D) + smax = np.zeros(D) + for i in np.arange(D): + smin[i], smax[i] = range[i] + + # Make sure the bins have a finite width. + for i in np.arange(len(smin)): + if smin[i] == smax[i]: + smin[i] = smin[i] - .5 + smax[i] = smax[i] + .5 + + # avoid rounding issues for comparisons when dealing with inexact types + if np.issubdtype(sample.dtype, np.inexact): + edge_dt = sample.dtype + else: + edge_dt = float + # Create edge arrays + for i in np.arange(D): + if np.isscalar(bins[i]): + if bins[i] < 1: + raise ValueError( + "Element at index %s in `bins` should be a positive " + "integer." % i) + nbin[i] = bins[i] + 2 # +2 for outlier bins + edges[i] = np.linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt) + else: + edges[i] = np.asarray(bins[i], edge_dt) + nbin[i] = len(edges[i]) + 1 # +1 for outlier bins + dedges[i] = np.diff(edges[i]) + if np.any(np.asarray(dedges[i]) <= 0): + raise ValueError( + "Found bin edge of size <= 0. Did you specify `bins` with" + "non-monotonic sequence?") + + nbin = np.asarray(nbin) + + # Handle empty input. + if N == 0: + return np.zeros(nbin-2), edges + + # Compute the bin number each sample falls into. + Ncount = {} + for i in np.arange(D): + Ncount[i] = np.digitize(sample[:, i], edges[i]) + + # Using digitize, values that fall on an edge are put in the right bin. + # For the rightmost bin, we want values equal to the right edge to be + # counted in the last bin, and not as an outlier. 
+ for i in np.arange(D): + # Rounding precision + mindiff = dedges[i].min() + if not np.isinf(mindiff): + decimal = int(-np.log10(mindiff)) + 6 + # Find which points are on the rightmost edge. + not_smaller_than_edge = (sample[:, i] >= edges[i][-1]) + on_edge = (np.around(sample[:, i], decimal) == + np.around(edges[i][-1], decimal)) + # Shift these points one bin to the left. + Ncount[i][np.nonzero(on_edge & not_smaller_than_edge)[0]] -= 1 + + # Flattened histogram matrix (1D) + # Reshape is used so that overlarge arrays + # will raise an error. + hist = np.zeros(nbin, float).reshape(-1) + + # Compute the sample indices in the flattened histogram matrix. + ni = nbin.argsort() + xy = np.zeros(N, int) + for i in np.arange(0, D-1): + xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() + xy += Ncount[ni[-1]] + + # Compute the number of repetitions in xy and assign it to the + # flattened histmat. + if len(xy) == 0: + return np.zeros(nbin-2, int), edges + + flatcount = np.bincount(xy, weights) + a = np.arange(len(flatcount)) + hist[a] = flatcount + + # Shape into a proper matrix + hist = hist.reshape(np.sort(nbin)) + for i in np.arange(nbin.size): + j = ni.argsort()[i] + hist = hist.swapaxes(i, j) + ni[i], ni[j] = ni[j], ni[i] + + # Remove outliers (indices 0 and -1 for each dimension). + core = D*[slice(1, -1)] + hist = hist[core] + + # Normalize if normed is True + if normed: + s = hist.sum() + for i in np.arange(D): + shape = np.ones(D, int) + shape[i] = nbin[i] - 2 + hist = hist / dedges[i].reshape(shape) + hist /= s + + if (hist.shape != nbin - 2).any(): + raise RuntimeError( + "Internal Shape Error") + return hist, edges diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 8381c2465..dbab69436 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -1617,518 +1617,6 @@ class TestSinc(object): assert_array_equal(y1, y3) -class TestHistogram(object): - - def setup(self): - pass - - def teardown(self): - pass - - def test_simple(self): - n = 100 - v = rand(n) - (a, b) = histogram(v) - # check if the sum of the bins equals the number of samples - assert_equal(np.sum(a, axis=0), n) - # check that the bin counts are evenly spaced when the data is from - # a linear function - (a, b) = histogram(np.linspace(0, 10, 100)) - assert_array_equal(a, 10) - - def test_one_bin(self): - # Ticket 632 - hist, edges = histogram([1, 2, 3, 4], [1, 2]) - assert_array_equal(hist, [2, ]) - assert_array_equal(edges, [1, 2]) - assert_raises(ValueError, histogram, [1, 2], bins=0) - h, e = histogram([1, 2], bins=1) - assert_equal(h, np.array([2])) - assert_allclose(e, np.array([1., 2.])) - - def test_normed(self): - # Check that the integral of the density equals 1. - n = 100 - v = rand(n) - a, b = histogram(v, normed=True) - area = np.sum(a * diff(b)) - assert_almost_equal(area, 1) - - # Check with non-constant bin widths (buggy but backwards - # compatible) - v = np.arange(10) - bins = [0, 1, 5, 9, 10] - a, b = histogram(v, bins, normed=True) - area = np.sum(a * diff(b)) - assert_almost_equal(area, 1) - - def test_density(self): - # Check that the integral of the density equals 1. 
- n = 100 - v = rand(n) - a, b = histogram(v, density=True) - area = np.sum(a * diff(b)) - assert_almost_equal(area, 1) - - # Check with non-constant bin widths - v = np.arange(10) - bins = [0, 1, 3, 6, 10] - a, b = histogram(v, bins, density=True) - assert_array_equal(a, .1) - assert_equal(np.sum(a * diff(b)), 1) - - # Variale bin widths are especially useful to deal with - # infinities. - v = np.arange(10) - bins = [0, 1, 3, 6, np.inf] - a, b = histogram(v, bins, density=True) - assert_array_equal(a, [.1, .1, .1, 0.]) - - # Taken from a bug report from N. Becker on the numpy-discussion - # mailing list Aug. 6, 2010. - counts, dmy = np.histogram( - [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True) - assert_equal(counts, [.25, 0]) - - def test_outliers(self): - # Check that outliers are not tallied - a = np.arange(10) + .5 - - # Lower outliers - h, b = histogram(a, range=[0, 9]) - assert_equal(h.sum(), 9) - - # Upper outliers - h, b = histogram(a, range=[1, 10]) - assert_equal(h.sum(), 9) - - # Normalization - h, b = histogram(a, range=[1, 9], normed=True) - assert_almost_equal((h * diff(b)).sum(), 1, decimal=15) - - # Weights - w = np.arange(10) + .5 - h, b = histogram(a, range=[1, 9], weights=w, normed=True) - assert_equal((h * diff(b)).sum(), 1) - - h, b = histogram(a, bins=8, range=[1, 9], weights=w) - assert_equal(h, w[1:-1]) - - def test_type(self): - # Check the type of the returned histogram - a = np.arange(10) + .5 - h, b = histogram(a) - assert_(np.issubdtype(h.dtype, np.integer)) - - h, b = histogram(a, normed=True) - assert_(np.issubdtype(h.dtype, np.floating)) - - h, b = histogram(a, weights=np.ones(10, int)) - assert_(np.issubdtype(h.dtype, np.integer)) - - h, b = histogram(a, weights=np.ones(10, float)) - assert_(np.issubdtype(h.dtype, np.floating)) - - def test_f32_rounding(self): - # gh-4799, check that the rounding of the edges works with float32 - x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32) - y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32) - counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100) - assert_equal(counts_hist.sum(), 3.) - - def test_weights(self): - v = rand(100) - w = np.ones(100) * 5 - a, b = histogram(v) - na, nb = histogram(v, normed=True) - wa, wb = histogram(v, weights=w) - nwa, nwb = histogram(v, weights=w, normed=True) - assert_array_almost_equal(a * 5, wa) - assert_array_almost_equal(na, nwa) - - # Check weights are properly applied. - v = np.linspace(0, 10, 10) - w = np.concatenate((np.zeros(5), np.ones(5))) - wa, wb = histogram(v, bins=np.arange(11), weights=w) - assert_array_almost_equal(wa, w) - - # Check with integer weights - wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1]) - assert_array_equal(wa, [4, 5, 0, 1]) - wa, wb = histogram( - [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True) - assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4) - - # Check weights with non-uniform bin widths - a, b = histogram( - np.arange(9), [0, 1, 3, 6, 10], - weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True) - assert_almost_equal(a, [.2, .1, .1, .075]) - - def test_exotic_weights(self): - - # Test the use of weights that are not integer or floats, but e.g. - # complex numbers or object types. 
- - # Complex weights - values = np.array([1.3, 2.5, 2.3]) - weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2]) - - # Check with custom bins - wa, wb = histogram(values, bins=[0, 2, 3], weights=weights) - assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3])) - - # Check with even bins - wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights) - assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3])) - - # Decimal weights - from decimal import Decimal - values = np.array([1.3, 2.5, 2.3]) - weights = np.array([Decimal(1), Decimal(2), Decimal(3)]) - - # Check with custom bins - wa, wb = histogram(values, bins=[0, 2, 3], weights=weights) - assert_array_almost_equal(wa, [Decimal(1), Decimal(5)]) - - # Check with even bins - wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights) - assert_array_almost_equal(wa, [Decimal(1), Decimal(5)]) - - def test_no_side_effects(self): - # This is a regression test that ensures that values passed to - # ``histogram`` are unchanged. - values = np.array([1.3, 2.5, 2.3]) - np.histogram(values, range=[-10, 10], bins=100) - assert_array_almost_equal(values, [1.3, 2.5, 2.3]) - - def test_empty(self): - a, b = histogram([], bins=([0, 1])) - assert_array_equal(a, np.array([0])) - assert_array_equal(b, np.array([0, 1])) - - def test_error_binnum_type (self): - # Tests if right Error is raised if bins argument is float - vals = np.linspace(0.0, 1.0, num=100) - histogram(vals, 5) - assert_raises(TypeError, histogram, vals, 2.4) - - def test_finite_range(self): - # Normal ranges should be fine - vals = np.linspace(0.0, 1.0, num=100) - histogram(vals, range=[0.25,0.75]) - assert_raises(ValueError, histogram, vals, range=[np.nan,0.75]) - assert_raises(ValueError, histogram, vals, range=[0.25,np.inf]) - - def test_bin_edge_cases(self): - # Ensure that floating-point computations correctly place edge cases. - arr = np.array([337, 404, 739, 806, 1007, 1811, 2012]) - hist, edges = np.histogram(arr, bins=8296, range=(2, 2280)) - mask = hist > 0 - left_edges = edges[:-1][mask] - right_edges = edges[1:][mask] - for x, left, right in zip(arr, left_edges, right_edges): - assert_(x >= left) - assert_(x < right) - - def test_last_bin_inclusive_range(self): - arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.]) - hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5)) - assert_equal(hist[-1], 1) - - def test_unsigned_monotonicity_check(self): - # Ensures ValueError is raised if bins not increasing monotonically - # when bins contain unsigned values (see #9222) - arr = np.array([2]) - bins = np.array([1, 3, 1], dtype='uint64') - with assert_raises(ValueError): - hist, edges = np.histogram(arr, bins=bins) - - -class TestHistogramOptimBinNums(object): - """ - Provide test coverage when using provided estimators for optimal number of - bins - """ - - def test_empty(self): - estimator_list = ['fd', 'scott', 'rice', 'sturges', - 'doane', 'sqrt', 'auto'] - # check it can deal with empty data - for estimator in estimator_list: - a, b = histogram([], bins=estimator) - assert_array_equal(a, np.array([0])) - assert_array_equal(b, np.array([0, 1])) - - def test_simple(self): - """ - Straightforward testing with a mixture of linspace data (for - consistency). All test values have been precomputed and the values - shouldn't change - """ - # Some basic sanity checking, with some fixed data. 
- # Checking for the correct number of bins - basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, - 'doane': 8, 'sqrt': 8, 'auto': 7}, - 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, - 'doane': 12, 'sqrt': 23, 'auto': 10}, - 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, - 'doane': 17, 'sqrt': 71, 'auto': 17}} - - for testlen, expectedResults in basic_test.items(): - # Create some sort of non uniform data to test with - # (2 peak uniform mixture) - x1 = np.linspace(-10, -1, testlen // 5 * 2) - x2 = np.linspace(1, 10, testlen // 5 * 3) - x = np.concatenate((x1, x2)) - for estimator, numbins in expectedResults.items(): - a, b = np.histogram(x, estimator) - assert_equal(len(a), numbins, err_msg="For the {0} estimator " - "with datasize of {1}".format(estimator, testlen)) - - def test_small(self): - """ - Smaller datasets have the potential to cause issues with the data - adaptive methods, especially the FD method. All bin numbers have been - precalculated. - """ - small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, - 'doane': 1, 'sqrt': 1}, - 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2, - 'doane': 1, 'sqrt': 2}, - 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3, - 'doane': 3, 'sqrt': 2}} - - for testlen, expectedResults in small_dat.items(): - testdat = np.arange(testlen) - for estimator, expbins in expectedResults.items(): - a, b = np.histogram(testdat, estimator) - assert_equal(len(a), expbins, err_msg="For the {0} estimator " - "with datasize of {1}".format(estimator, testlen)) - - def test_incorrect_methods(self): - """ - Check a Value Error is thrown when an unknown string is passed in - """ - check_list = ['mad', 'freeman', 'histograms', 'IQR'] - for estimator in check_list: - assert_raises(ValueError, histogram, [1, 2, 3], estimator) - - def test_novariance(self): - """ - Check that methods handle no variance in data - Primarily for Scott and FD as the SD and IQR are both 0 in this case - """ - novar_dataset = np.ones(100) - novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, - 'doane': 1, 'sqrt': 1, 'auto': 1} - - for estimator, numbins in novar_resultdict.items(): - a, b = np.histogram(novar_dataset, estimator) - assert_equal(len(a), numbins, err_msg="{0} estimator, " - "No Variance test".format(estimator)) - - def test_outlier(self): - """ - Check the FD, Scott and Doane with outliers. - - The FD estimates a smaller binwidth since it's less affected by - outliers. Since the range is so (artificially) large, this means more - bins, most of which will be empty, but the data of interest usually is - unaffected. The Scott estimator is more affected and returns fewer bins, - despite most of the variance being in one area of the data. The Doane - estimator lies somewhere between the other two. - """ - xcenter = np.linspace(-10, 10, 50) - outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter)) - - outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11} - - for estimator, numbins in outlier_resultdict.items(): - a, b = np.histogram(outlier_dataset, estimator) - assert_equal(len(a), numbins) - - def test_simple_range(self): - """ - Straightforward testing with a mixture of linspace data (for - consistency). Adding in a 3rd mixture that will then be - completely ignored. All test values have been precomputed and - the shouldn't change. - """ - # some basic sanity checking, with some fixed data. 
- # Checking for the correct number of bins - basic_test = { - 50: {'fd': 8, 'scott': 8, 'rice': 15, - 'sturges': 14, 'auto': 14}, - 500: {'fd': 15, 'scott': 16, 'rice': 32, - 'sturges': 20, 'auto': 20}, - 5000: {'fd': 33, 'scott': 33, 'rice': 69, - 'sturges': 27, 'auto': 33} - } - - for testlen, expectedResults in basic_test.items(): - # create some sort of non uniform data to test with - # (3 peak uniform mixture) - x1 = np.linspace(-10, -1, testlen // 5 * 2) - x2 = np.linspace(1, 10, testlen // 5 * 3) - x3 = np.linspace(-100, -50, testlen) - x = np.hstack((x1, x2, x3)) - for estimator, numbins in expectedResults.items(): - a, b = np.histogram(x, estimator, range = (-20, 20)) - msg = "For the {0} estimator".format(estimator) - msg += " with datasize of {0}".format(testlen) - assert_equal(len(a), numbins, err_msg=msg) - - def test_simple_weighted(self): - """ - Check that weighted data raises a TypeError - """ - estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto'] - for estimator in estimator_list: - assert_raises(TypeError, histogram, [1, 2, 3], - estimator, weights=[1, 2, 3]) - - -class TestHistogramdd(object): - - def test_simple(self): - x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], - [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]]) - H, edges = histogramdd(x, (2, 3, 3), - range=[[-1, 1], [0, 3], [0, 3]]) - answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]], - [[0, 1, 0], [0, 0, 1], [0, 0, 1]]]) - assert_array_equal(H, answer) - - # Check normalization - ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]] - H, edges = histogramdd(x, bins=ed, normed=True) - assert_(np.all(H == answer / 12.)) - - # Check that H has the correct shape. - H, edges = histogramdd(x, (2, 3, 4), - range=[[-1, 1], [0, 3], [0, 4]], - normed=True) - answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]], - [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]) - assert_array_almost_equal(H, answer / 6., 4) - # Check that a sequence of arrays is accepted and H has the correct - # shape. - z = [np.squeeze(y) for y in split(x, 3, axis=1)] - H, edges = histogramdd( - z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]]) - answer = np.array([[[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 0], [1, 0]], - [[0, 1], [0, 0], [0, 0]], - [[0, 0], [0, 0], [0, 0]]]) - assert_array_equal(H, answer) - - Z = np.zeros((5, 5, 5)) - Z[list(range(5)), list(range(5)), list(range(5))] = 1. - H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5) - assert_array_equal(H, Z) - - def test_shape_3d(self): - # All possible permutations for bins of different lengths in 3D. - bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4), - (4, 5, 6)) - r = rand(10, 3) - for b in bins: - H, edges = histogramdd(r, b) - assert_(H.shape == b) - - def test_shape_4d(self): - # All possible permutations for bins of different lengths in 4D. 
- bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4), - (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6), - (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7), - (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5), - (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5), - (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4)) - - r = rand(10, 4) - for b in bins: - H, edges = histogramdd(r, b) - assert_(H.shape == b) - - def test_weights(self): - v = rand(100, 2) - hist, edges = histogramdd(v) - n_hist, edges = histogramdd(v, normed=True) - w_hist, edges = histogramdd(v, weights=np.ones(100)) - assert_array_equal(w_hist, hist) - w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True) - assert_array_equal(w_hist, n_hist) - w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2) - assert_array_equal(w_hist, 2 * hist) - - def test_identical_samples(self): - x = np.zeros((10, 2), int) - hist, edges = histogramdd(x, bins=2) - assert_array_equal(edges[0], np.array([-0.5, 0., 0.5])) - - def test_empty(self): - a, b = histogramdd([[], []], bins=([0, 1], [0, 1])) - assert_array_max_ulp(a, np.array([[0.]])) - a, b = np.histogramdd([[], [], []], bins=2) - assert_array_max_ulp(a, np.zeros((2, 2, 2))) - - def test_bins_errors(self): - # There are two ways to specify bins. Check for the right errors - # when mixing those. - x = np.arange(8).reshape(2, 4) - assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5]) - assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1]) - assert_raises( - ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]]) - assert_raises( - ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]]) - assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]])) - - def test_inf_edges(self): - # Test using +/-inf bin edges works. See #1788. - with np.errstate(invalid='ignore'): - x = np.arange(6).reshape(3, 2) - expected = np.array([[1, 0], [0, 1], [0, 1]]) - h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]]) - assert_allclose(h, expected) - h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])]) - assert_allclose(h, expected) - h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]]) - assert_allclose(h, expected) - - def test_rightmost_binedge(self): - # Test event very close to rightmost binedge. See Github issue #4266 - x = [0.9999999995] - bins = [[0., 0.5, 1.0]] - hist, _ = histogramdd(x, bins=bins) - assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) - x = [1.0] - bins = [[0., 0.5, 1.0]] - hist, _ = histogramdd(x, bins=bins) - assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) - x = [1.0000000001] - bins = [[0., 0.5, 1.0]] - hist, _ = histogramdd(x, bins=bins) - assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) 
-        x = [1.0001]
-        bins = [[0., 0.5, 1.0]]
-        hist, _ = histogramdd(x, bins=bins)
-        assert_(hist[0] == 0.0)
-        assert_(hist[1] == 0.0)
-
-    def test_finite_range(self):
-        vals = np.random.random((100, 3))
-        histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]])
-        assert_raises(ValueError, histogramdd, vals,
-                      range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
-        assert_raises(ValueError, histogramdd, vals,
-                      range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
-
-
 class TestUnique(object):
 
     def test_simple(self):
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
new file mode 100644
index 000000000..59baf91fe
--- /dev/null
+++ b/numpy/lib/tests/test_histograms.py
@@ -0,0 +1,527 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+
+from numpy.lib.histograms import histogram, histogramdd
+from numpy.testing import (
+    run_module_suite, assert_, assert_equal, assert_array_equal,
+    assert_almost_equal, assert_array_almost_equal, assert_raises,
+    assert_allclose, assert_array_max_ulp, assert_warns, assert_raises_regex,
+    dec, suppress_warnings, HAS_REFCOUNT,
+    )
+
+
+class TestHistogram(object):
+
+    def setup(self):
+        pass
+
+    def teardown(self):
+        pass
+
+    def test_simple(self):
+        n = 100
+        v = np.random.rand(n)
+        (a, b) = histogram(v)
+        # check if the sum of the bins equals the number of samples
+        assert_equal(np.sum(a, axis=0), n)
+        # check that the bin counts are evenly spaced when the data is from
+        # a linear function
+        (a, b) = histogram(np.linspace(0, 10, 100))
+        assert_array_equal(a, 10)
+
+    def test_one_bin(self):
+        # Ticket 632
+        hist, edges = histogram([1, 2, 3, 4], [1, 2])
+        assert_array_equal(hist, [2, ])
+        assert_array_equal(edges, [1, 2])
+        assert_raises(ValueError, histogram, [1, 2], bins=0)
+        h, e = histogram([1, 2], bins=1)
+        assert_equal(h, np.array([2]))
+        assert_allclose(e, np.array([1., 2.]))
+
+    def test_normed(self):
+        # Check that the integral of the density equals 1.
+        n = 100
+        v = np.random.rand(n)
+        a, b = histogram(v, normed=True)
+        area = np.sum(a * np.diff(b))
+        assert_almost_equal(area, 1)
+
+        # Check with non-constant bin widths (buggy but backwards
+        # compatible)
+        v = np.arange(10)
+        bins = [0, 1, 5, 9, 10]
+        a, b = histogram(v, bins, normed=True)
+        area = np.sum(a * np.diff(b))
+        assert_almost_equal(area, 1)
+
+    def test_density(self):
+        # Check that the integral of the density equals 1.
+        n = 100
+        v = np.random.rand(n)
+        a, b = histogram(v, density=True)
+        area = np.sum(a * np.diff(b))
+        assert_almost_equal(area, 1)
+
+        # Check with non-constant bin widths
+        v = np.arange(10)
+        bins = [0, 1, 3, 6, 10]
+        a, b = histogram(v, bins, density=True)
+        assert_array_equal(a, .1)
+        assert_equal(np.sum(a * np.diff(b)), 1)
+
+        # Variable bin widths are especially useful to deal with
+        # infinities.
+        v = np.arange(10)
+        bins = [0, 1, 3, 6, np.inf]
+        a, b = histogram(v, bins, density=True)
+        assert_array_equal(a, [.1, .1, .1, 0.])
+
+        # Taken from a bug report from N. Becker on the numpy-discussion
+        # mailing list Aug. 6, 2010.
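+        # (Only the finite-width first bin carries density: it holds one of
+        # the four samples over a width of 1.0, giving 1/(4*1.0) = 0.25; the
+        # infinite-width last bin gets density 0.)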
+        counts, dmy = np.histogram(
+            [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True)
+        assert_equal(counts, [.25, 0])
+
+    def test_outliers(self):
+        # Check that outliers are not tallied
+        a = np.arange(10) + .5
+
+        # Lower outliers
+        h, b = histogram(a, range=[0, 9])
+        assert_equal(h.sum(), 9)
+
+        # Upper outliers
+        h, b = histogram(a, range=[1, 10])
+        assert_equal(h.sum(), 9)
+
+        # Normalization
+        h, b = histogram(a, range=[1, 9], normed=True)
+        assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15)
+
+        # Weights
+        w = np.arange(10) + .5
+        h, b = histogram(a, range=[1, 9], weights=w, normed=True)
+        assert_equal((h * np.diff(b)).sum(), 1)
+
+        h, b = histogram(a, bins=8, range=[1, 9], weights=w)
+        assert_equal(h, w[1:-1])
+
+    def test_type(self):
+        # Check the type of the returned histogram
+        a = np.arange(10) + .5
+        h, b = histogram(a)
+        assert_(np.issubdtype(h.dtype, np.integer))
+
+        h, b = histogram(a, normed=True)
+        assert_(np.issubdtype(h.dtype, np.floating))
+
+        h, b = histogram(a, weights=np.ones(10, int))
+        assert_(np.issubdtype(h.dtype, np.integer))
+
+        h, b = histogram(a, weights=np.ones(10, float))
+        assert_(np.issubdtype(h.dtype, np.floating))
+
+    def test_f32_rounding(self):
+        # gh-4799, check that the rounding of the edges works with float32
+        x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32)
+        y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32)
+        counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100)
+        assert_equal(counts_hist.sum(), 3.)
+
+    def test_weights(self):
+        v = np.random.rand(100)
+        w = np.ones(100) * 5
+        a, b = histogram(v)
+        na, nb = histogram(v, normed=True)
+        wa, wb = histogram(v, weights=w)
+        nwa, nwb = histogram(v, weights=w, normed=True)
+        assert_array_almost_equal(a * 5, wa)
+        assert_array_almost_equal(na, nwa)
+
+        # Check weights are properly applied.
+        v = np.linspace(0, 10, 10)
+        w = np.concatenate((np.zeros(5), np.ones(5)))
+        wa, wb = histogram(v, bins=np.arange(11), weights=w)
+        assert_array_almost_equal(wa, w)
+
+        # Check with integer weights
+        wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1])
+        assert_array_equal(wa, [4, 5, 0, 1])
+        wa, wb = histogram(
+            [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True)
+        assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4)
+
+        # Check weights with non-uniform bin widths
+        a, b = histogram(
+            np.arange(9), [0, 1, 3, 6, 10],
+            weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True)
+        assert_almost_equal(a, [.2, .1, .1, .075])
+
+    def test_exotic_weights(self):
+
+        # Test the use of weights that are not integers or floats, but e.g.
+        # complex numbers or object types.
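+        # (Nothing below depends on a specific weight dtype: the per-bin
+        # totals accumulate whatever supports addition, so complex sums stay
+        # complex and Decimal sums stay Decimal.)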
+
+        # Complex weights
+        values = np.array([1.3, 2.5, 2.3])
+        weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2])
+
+        # Check with custom bins
+        wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+        assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+        # Check with even bins
+        wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+        assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+        # Decimal weights
+        from decimal import Decimal
+        values = np.array([1.3, 2.5, 2.3])
+        weights = np.array([Decimal(1), Decimal(2), Decimal(3)])
+
+        # Check with custom bins
+        wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+        assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+        # Check with even bins
+        wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+        assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+    def test_no_side_effects(self):
+        # This is a regression test that ensures that values passed to
+        # ``histogram`` are unchanged.
+        values = np.array([1.3, 2.5, 2.3])
+        np.histogram(values, range=[-10, 10], bins=100)
+        assert_array_almost_equal(values, [1.3, 2.5, 2.3])
+
+    def test_empty(self):
+        a, b = histogram([], bins=([0, 1]))
+        assert_array_equal(a, np.array([0]))
+        assert_array_equal(b, np.array([0, 1]))
+
+    def test_error_binnum_type(self):
+        # Test that the right error is raised if the bins argument is a float
+        vals = np.linspace(0.0, 1.0, num=100)
+        histogram(vals, 5)
+        assert_raises(TypeError, histogram, vals, 2.4)
+
+    def test_finite_range(self):
+        # Normal ranges should be fine
+        vals = np.linspace(0.0, 1.0, num=100)
+        histogram(vals, range=[0.25, 0.75])
+        assert_raises(ValueError, histogram, vals, range=[np.nan, 0.75])
+        assert_raises(ValueError, histogram, vals, range=[0.25, np.inf])
+
+    def test_bin_edge_cases(self):
+        # Ensure that floating-point computations correctly place edge cases.
+        arr = np.array([337, 404, 739, 806, 1007, 1811, 2012])
+        hist, edges = np.histogram(arr, bins=8296, range=(2, 2280))
+        mask = hist > 0
+        left_edges = edges[:-1][mask]
+        right_edges = edges[1:][mask]
+        for x, left, right in zip(arr, left_edges, right_edges):
+            assert_(x >= left)
+            assert_(x < right)
+
+    def test_last_bin_inclusive_range(self):
+        arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
+        hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5))
+        assert_equal(hist[-1], 1)
+
+    def test_unsigned_monotonicity_check(self):
+        # Ensure ValueError is raised if bins are not monotonically increasing
+        # when bins contain unsigned values (see #9222)
+        arr = np.array([2])
+        bins = np.array([1, 3, 1], dtype='uint64')
+        with assert_raises(ValueError):
+            hist, edges = np.histogram(arr, bins=bins)
+
+
+class TestHistogramOptimBinNums(object):
+    """
+    Provide test coverage when using provided estimators for optimal number of
+    bins.
+    """
+
+    def test_empty(self):
+        estimator_list = ['fd', 'scott', 'rice', 'sturges',
+                          'doane', 'sqrt', 'auto']
+        # check it can deal with empty data
+        for estimator in estimator_list:
+            a, b = histogram([], bins=estimator)
+            assert_array_equal(a, np.array([0]))
+            assert_array_equal(b, np.array([0, 1]))
+
+    def test_simple(self):
+        """
+        Straightforward testing with a mixture of linspace data (for
+        consistency). All test values have been precomputed and the values
+        shouldn't change.
+        """
+        # Some basic sanity checking, with some fixed data.
+        # Checking for the correct number of bins
+        basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
+                           'doane': 8, 'sqrt': 8, 'auto': 7},
+                      500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
+                            'doane': 12, 'sqrt': 23, 'auto': 10},
+                      5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
+                             'doane': 17, 'sqrt': 71, 'auto': 17}}
+
+        for testlen, expectedResults in basic_test.items():
+            # Create some sort of non-uniform data to test with
+            # (2 peak uniform mixture)
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x = np.concatenate((x1, x2))
+            for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator)
+                assert_equal(len(a), numbins, err_msg="For the {0} estimator "
+                             "with datasize of {1}".format(estimator, testlen))
+
+    def test_small(self):
+        """
+        Smaller datasets have the potential to cause issues with the data
+        adaptive methods, especially the FD method. All bin numbers have been
+        precalculated.
+        """
+        small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+                         'doane': 1, 'sqrt': 1},
+                     2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
+                         'doane': 1, 'sqrt': 2},
+                     3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
+                         'doane': 3, 'sqrt': 2}}
+
+        for testlen, expectedResults in small_dat.items():
+            testdat = np.arange(testlen)
+            for estimator, expbins in expectedResults.items():
+                a, b = np.histogram(testdat, estimator)
+                assert_equal(len(a), expbins, err_msg="For the {0} estimator "
+                             "with datasize of {1}".format(estimator, testlen))
+
+    def test_incorrect_methods(self):
+        """
+        Check a ValueError is thrown when an unknown string is passed in.
+        """
+        check_list = ['mad', 'freeman', 'histograms', 'IQR']
+        for estimator in check_list:
+            assert_raises(ValueError, histogram, [1, 2, 3], estimator)
+
+    def test_novariance(self):
+        """
+        Check that methods handle no variance in data.
+        Primarily for Scott and FD, as the SD and IQR are both 0 in this case.
+        """
+        novar_dataset = np.ones(100)
+        novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+                            'doane': 1, 'sqrt': 1, 'auto': 1}
+
+        for estimator, numbins in novar_resultdict.items():
+            a, b = np.histogram(novar_dataset, estimator)
+            assert_equal(len(a), numbins, err_msg="{0} estimator, "
+                         "No Variance test".format(estimator))
+
+    def test_outlier(self):
+        """
+        Check the FD, Scott and Doane estimators with outliers.
+
+        The FD estimator picks a smaller binwidth since it's less affected by
+        outliers. Since the range is so (artificially) large, this means more
+        bins, most of which will be empty, but the data of interest usually is
+        unaffected. The Scott estimator is more affected and returns fewer
+        bins, despite most of the variance being in one area of the data. The
+        Doane estimator lies somewhere between the other two.
+        """
+        xcenter = np.linspace(-10, 10, 50)
+        outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+        outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11}
+
+        for estimator, numbins in outlier_resultdict.items():
+            a, b = np.histogram(outlier_dataset, estimator)
+            assert_equal(len(a), numbins)
+
+    def test_simple_range(self):
+        """
+        Straightforward testing with a mixture of linspace data (for
+        consistency). Adding in a 3rd mixture that will then be
+        completely ignored. All test values have been precomputed and
+        they shouldn't change.
+        """
+        # Some basic sanity checking, with some fixed data.
+        # Checking for the correct number of bins
+        basic_test = {
+            50: {'fd': 8, 'scott': 8, 'rice': 15,
+                 'sturges': 14, 'auto': 14},
+            500: {'fd': 15, 'scott': 16, 'rice': 32,
+                  'sturges': 20, 'auto': 20},
+            5000: {'fd': 33, 'scott': 33, 'rice': 69,
+                   'sturges': 27, 'auto': 33}
+        }
+
+        for testlen, expectedResults in basic_test.items():
+            # Create some sort of non-uniform data to test with
+            # (3 peak uniform mixture)
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x3 = np.linspace(-100, -50, testlen)
+            x = np.hstack((x1, x2, x3))
+            for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator, range=(-20, 20))
+                msg = "For the {0} estimator".format(estimator)
+                msg += " with datasize of {0}".format(testlen)
+                assert_equal(len(a), numbins, err_msg=msg)
+
+    def test_simple_weighted(self):
+        """
+        Check that weighted data raises a TypeError.
+        """
+        estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+        for estimator in estimator_list:
+            assert_raises(TypeError, histogram, [1, 2, 3],
+                          estimator, weights=[1, 2, 3])
+
+
+class TestHistogramdd(object):
+
+    def test_simple(self):
+        x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5],
+                      [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
+        H, edges = histogramdd(x, (2, 3, 3),
+                               range=[[-1, 1], [0, 3], [0, 3]])
+        answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]],
+                           [[0, 1, 0], [0, 0, 1], [0, 0, 1]]])
+        assert_array_equal(H, answer)
+
+        # Check normalization
+        ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
+        H, edges = histogramdd(x, bins=ed, normed=True)
+        assert_(np.all(H == answer / 12.))
+
+        # Check that H has the correct shape.
+        H, edges = histogramdd(x, (2, 3, 4),
+                               range=[[-1, 1], [0, 3], [0, 4]],
+                               normed=True)
+        answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
+                           [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
+        assert_array_almost_equal(H, answer / 6., 4)
+        # Check that a sequence of arrays is accepted and H has the correct
+        # shape.
+        z = [np.squeeze(y) for y in np.split(x, 3, axis=1)]
+        H, edges = histogramdd(
+            z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]])
+        answer = np.array([[[0, 0], [0, 0], [0, 0]],
+                           [[0, 1], [0, 0], [1, 0]],
+                           [[0, 1], [0, 0], [0, 0]],
+                           [[0, 0], [0, 0], [0, 0]]])
+        assert_array_equal(H, answer)
+
+        Z = np.zeros((5, 5, 5))
+        Z[list(range(5)), list(range(5)), list(range(5))] = 1.
+        H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5)
+        assert_array_equal(H, Z)
+
+    def test_shape_3d(self):
+        # All possible permutations for bins of different lengths in 3D.
+        bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4),
+                (4, 5, 6))
+        r = np.random.rand(10, 3)
+        for b in bins:
+            H, edges = histogramdd(r, b)
+            assert_(H.shape == b)
+
+    def test_shape_4d(self):
+        # All possible permutations for bins of different lengths in 4D.
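+        # (All 24 orderings of (4, 5, 6, 7); the resulting histogram shape
+        # must follow the per-axis bin counts in the order given.)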
+        bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4),
+                (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6),
+                (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7),
+                (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5),
+                (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5),
+                (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4))
+
+        r = np.random.rand(10, 4)
+        for b in bins:
+            H, edges = histogramdd(r, b)
+            assert_(H.shape == b)
+
+    def test_weights(self):
+        v = np.random.rand(100, 2)
+        hist, edges = histogramdd(v)
+        n_hist, edges = histogramdd(v, normed=True)
+        w_hist, edges = histogramdd(v, weights=np.ones(100))
+        assert_array_equal(w_hist, hist)
+        w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True)
+        assert_array_equal(w_hist, n_hist)
+        w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
+        assert_array_equal(w_hist, 2 * hist)
+
+    def test_identical_samples(self):
+        x = np.zeros((10, 2), int)
+        hist, edges = histogramdd(x, bins=2)
+        assert_array_equal(edges[0], np.array([-0.5, 0., 0.5]))
+
+    def test_empty(self):
+        a, b = histogramdd([[], []], bins=([0, 1], [0, 1]))
+        assert_array_max_ulp(a, np.array([[0.]]))
+        a, b = np.histogramdd([[], [], []], bins=2)
+        assert_array_max_ulp(a, np.zeros((2, 2, 2)))
+
+    def test_bins_errors(self):
+        # There are two ways to specify bins. Check for the right errors
+        # when mixing those.
+        x = np.arange(8).reshape(2, 4)
+        assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
+        assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
+        assert_raises(
+            ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
+        assert_raises(
+            ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
+        assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
+
+    def test_inf_edges(self):
+        # Test that using +/-inf bin edges works. See #1788.
+        with np.errstate(invalid='ignore'):
+            x = np.arange(6).reshape(3, 2)
+            expected = np.array([[1, 0], [0, 1], [0, 1]])
+            h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]])
+            assert_allclose(h, expected)
+            h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])])
+            assert_allclose(h, expected)
+            h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]])
+            assert_allclose(h, expected)
+
+    def test_rightmost_binedge(self):
+        # Test an event very close to the rightmost bin edge. See GitHub
+        # issue #4266.
+        x = [0.9999999995]
+        bins = [[0., 0.5, 1.0]]
+        hist, _ = histogramdd(x, bins=bins)
+        assert_(hist[0] == 0.0)
+        assert_(hist[1] == 1.)
+        x = [1.0]
+        bins = [[0., 0.5, 1.0]]
+        hist, _ = histogramdd(x, bins=bins)
+        assert_(hist[0] == 0.0)
+        assert_(hist[1] == 1.)
+        x = [1.0000000001]
+        bins = [[0., 0.5, 1.0]]
+        hist, _ = histogramdd(x, bins=bins)
+        assert_(hist[0] == 0.0)
+        assert_(hist[1] == 1.)
+        x = [1.0001]
+        bins = [[0., 0.5, 1.0]]
+        hist, _ = histogramdd(x, bins=bins)
+        assert_(hist[0] == 0.0)
+        assert_(hist[1] == 0.0)
+
+    def test_finite_range(self):
+        vals = np.random.random((100, 3))
+        histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]])
+        assert_raises(ValueError, histogramdd, vals,
+                      range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
+        assert_raises(ValueError, histogramdd, vals,
+                      range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
+
+
+if __name__ == "__main__":
+    run_module_suite()
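
For quick reference, a minimal standalone sketch of the behaviors the moved
tests pin down most often: the inclusive rightmost bin edge, density
normalization over non-uniform bins, and the index-linearization idea inside
histogramdd. The snippet is illustrative only and not part of the patch; the
np.ravel_multi_index comparison is a conceptual stand-in for the
flattened-index computation shown in numpy/lib/histograms.py above, not the
exact code path.

    import numpy as np

    # The rightmost edge is inclusive: a sample exactly on the last edge is
    # counted in the final bin, while a sample just past it is dropped.
    hist, _ = np.histogramdd([1.0], bins=[[0.0, 0.5, 1.0]])
    assert hist.tolist() == [0.0, 1.0]
    hist, _ = np.histogramdd([1.0001], bins=[[0.0, 0.5, 1.0]])
    assert hist.tolist() == [0.0, 0.0]

    # density=True integrates to 1 even with non-uniform bins: counts
    # (1, 2, 3, 4) over widths (1, 2, 3, 4) and 10 samples give 0.1 per bin.
    a, b = np.histogram(np.arange(10), bins=[0, 1, 3, 6, 10], density=True)
    assert np.isclose(np.sum(a * np.diff(b)), 1.0)

    # Conceptually, histogramdd linearizes per-axis bin indices before
    # counting: indices (1, 2) on a (3, 4)-bin grid map to 1 * 4 + 2 = 6.
    flat = np.ravel_multi_index(([1], [2]), dims=(3, 4))
    assert flat[0] == 6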