diff options
-rw-r--r-- | doc/release/1.15.0-notes.rst | 2 | ||||
-rw-r--r-- | numpy/lib/histograms.py | 128 | ||||
-rw-r--r-- | numpy/lib/tests/test_histograms.py | 16 |
3 files changed, 143 insertions, 3 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst index 4d19a953d..fadce7684 100644 --- a/doc/release/1.15.0-notes.rst +++ b/doc/release/1.15.0-notes.rst @@ -22,6 +22,8 @@ New functions ... print(np.array([2.0])) / 3 [0.67] + * `np.histogram_bin_edges`, a function to get the edges of the bins used by a histogram + without needing to calculate the histogram. Deprecations ============ diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index cf40fcfe0..f151a6039 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -8,7 +8,7 @@ import operator import numpy as np from numpy.compat.py3k import basestring -__all__ = ['histogram', 'histogramdd'] +__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges'] def _hist_bin_sqrt(x): @@ -348,6 +348,130 @@ def _search_sorted_inclusive(a, v): )) +def histogram_bin_edges(a, bins=10, range=None, weights=None): + """ + Function to calculate only the edges of the bins used by the `histogram` function. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened array. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines the bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + + If `bins` is a string from the list below, `histogram_bin_edges` will use + the method chosen to calculate the optimal bin width and + consequently the number of bins (see `Notes` for more detail on + the estimators) from the data that falls within the requested + range. While the bin width will be optimal for the actual data + in the range, the number of bins will be computed to fill the + entire range, including the empty portions. For visualisation, + using the 'auto' option is suggested. Weighted data is not + supported for automated bin size selection. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good + all around performance. + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into + account data variability and data size. + + 'doane' + An improved version of Sturges' estimator that works better + with non-normal datasets. + + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'rice' + Estimator does not take variability into account, only data + size. Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only + optimal for gaussian data and underestimates number of bins + for large non-gaussian datasets. + + 'sqrt' + Square root (of data size) estimator, used by Excel and + other programs for its speed and simplicity. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). This is currently not used by any of the bin estimators, + but may be in the future. + + Returns + ------- + bin_edges : array of dtype float + The edges to pass into `histogram` + + See Also + -------- + histogram + + Examples + -------- + >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5]) + >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1)) + array([0. , 0.25, 0.5 , 0.75, 1. ]) + >>> np.histogram_bin_edges(arr, bins=2) + array([0. , 2.5, 5. ]) + + For consistency with histogram, an array of pre-computed bins is + passed through unmodified: + + >>> np.histogram_bin_edges(arr, [1, 2]) + array([1, 2]) + + This function allows one set of bins to be computed, and reused across + multiple histograms: + + >>> shared_bins = np.histogram_bin_edges(arr, bins='auto') + >>> shared_bins + array([0., 1., 2., 3., 4., 5.]) + + >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1]) + >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins) + >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins) + + >>> hist_0; hist_1 + array([1, 1, 0, 1, 0]) + array([2, 0, 1, 1, 2]) + + Which gives more easily comparable results than using separate bins for + each histogram: + + >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto') + >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto') + >>> hist_0; hist1 + array([1, 1, 1]) + array([2, 1, 1, 2]) + >>> bins_0; bins_1 + array([0., 1., 2., 3.]) + array([0. , 1.25, 2.5 , 3.75, 5. ]) + + """ + a, weights = _ravel_and_check_weights(a, weights) + bin_edges, _ = _get_bin_edges(a, bins, range, weights) + return bin_edges + + def histogram(a, bins=10, range=None, normed=False, weights=None, density=None): r""" @@ -448,7 +572,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, See Also -------- - histogramdd, bincount, searchsorted, digitize + histogramdd, bincount, searchsorted, digitize, histogram_bin_edges Notes ----- diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py index a2c684a20..4f7af214c 100644 --- a/numpy/lib/tests/test_histograms.py +++ b/numpy/lib/tests/test_histograms.py @@ -2,7 +2,7 @@ from __future__ import division, absolute_import, print_function import numpy as np -from numpy.lib.histograms import histogram, histogramdd +from numpy.lib.histograms import histogram, histogramdd, histogram_bin_edges from numpy.testing import ( run_module_suite, assert_, assert_equal, assert_array_equal, assert_almost_equal, assert_array_almost_equal, assert_raises, @@ -346,6 +346,20 @@ class TestHistogram(object): self.do_precision(np.single, np.longdouble) self.do_precision(np.double, np.longdouble) + def test_histogram_bin_edges(self): + hist, e = histogram([1, 2, 3, 4], [1, 2]) + edges = histogram_bin_edges([1, 2, 3, 4], [1, 2]) + assert_array_equal(edges, e) + + arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.]) + hist, e = histogram(arr, bins=30, range=(-0.5, 5)) + edges = histogram_bin_edges(arr, bins=30, range=(-0.5, 5)) + assert_array_equal(edges, e) + + hist, e = histogram(arr, bins='auto', range=(0, 1)) + edges = histogram_bin_edges(arr, bins='auto', range=(0, 1)) + assert_array_equal(edges, e) + class TestHistogramOptimBinNums(object): """ |