summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/release/1.15.0-notes.rst2
-rw-r--r--numpy/lib/histograms.py128
-rw-r--r--numpy/lib/tests/test_histograms.py16
3 files changed, 143 insertions, 3 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index 4d19a953d..fadce7684 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -22,6 +22,8 @@ New functions
... print(np.array([2.0])) / 3
[0.67]
+ * `np.histogram_bin_edges`, a function to get the edges of the bins used by a histogram
+ without needing to calculate the histogram.
Deprecations
============
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index cf40fcfe0..f151a6039 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -8,7 +8,7 @@ import operator
import numpy as np
from numpy.compat.py3k import basestring
-__all__ = ['histogram', 'histogramdd']
+__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']
def _hist_bin_sqrt(x):
@@ -348,6 +348,130 @@ def _search_sorted_inclusive(a, v):
))
+def histogram_bin_edges(a, bins=10, range=None, weights=None):
+ """
+ Function to calculate only the edges of the bins used by the `histogram` function.
+
+ Parameters
+ ----------
+ a : array_like
+ Input data. The histogram is computed over the flattened array.
+ bins : int or sequence of scalars or str, optional
+ If `bins` is an int, it defines the number of equal-width
+ bins in the given range (10, by default). If `bins` is a
+ sequence, it defines the bin edges, including the rightmost
+ edge, allowing for non-uniform bin widths.
+
+ If `bins` is a string from the list below, `histogram_bin_edges` will use
+ the method chosen to calculate the optimal bin width and
+ consequently the number of bins (see `Notes` for more detail on
+ the estimators) from the data that falls within the requested
+ range. While the bin width will be optimal for the actual data
+ in the range, the number of bins will be computed to fill the
+ entire range, including the empty portions. For visualisation,
+ using the 'auto' option is suggested. Weighted data is not
+ supported for automated bin size selection.
+
+ 'auto'
+ Maximum of the 'sturges' and 'fd' estimators. Provides good
+ all around performance.
+
+ 'fd' (Freedman Diaconis Estimator)
+ Robust (resilient to outliers) estimator that takes into
+ account data variability and data size.
+
+ 'doane'
+ An improved version of Sturges' estimator that works better
+ with non-normal datasets.
+
+ 'scott'
+ Less robust estimator that takes into account data
+ variability and data size.
+
+ 'rice'
+ Estimator does not take variability into account, only data
+ size. Commonly overestimates number of bins required.
+
+ 'sturges'
+ R's default method, only accounts for data size. Only
+ optimal for gaussian data and underestimates number of bins
+ for large non-gaussian datasets.
+
+ 'sqrt'
+ Square root (of data size) estimator, used by Excel and
+ other programs for its speed and simplicity.
+
+ range : (float, float), optional
+ The lower and upper range of the bins. If not provided, range
+ is simply ``(a.min(), a.max())``. Values outside the range are
+ ignored. The first element of the range must be less than or
+ equal to the second. `range` affects the automatic bin
+ computation as well. While bin width is computed to be optimal
+ based on the actual data within `range`, the bin count will fill
+ the entire range including portions containing no data.
+
+ weights : array_like, optional
+ An array of weights, of the same shape as `a`. Each value in
+ `a` only contributes its associated weight towards the bin count
+ (instead of 1). This is currently not used by any of the bin estimators,
+ but may be in the future.
+
+ Returns
+ -------
+ bin_edges : array of dtype float
+ The edges to pass into `histogram`.
+
+ See Also
+ --------
+ histogram
+
+ Examples
+ --------
+ >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])
+ >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1))
+ array([0. , 0.25, 0.5 , 0.75, 1. ])
+ >>> np.histogram_bin_edges(arr, bins=2)
+ array([0. , 2.5, 5. ])
+
+ For consistency with histogram, an array of pre-computed bins is
+ passed through unmodified:
+
+ >>> np.histogram_bin_edges(arr, [1, 2])
+ array([1, 2])
+
+ This function allows one set of bins to be computed, and reused across
+ multiple histograms:
+
+ >>> shared_bins = np.histogram_bin_edges(arr, bins='auto')
+ >>> shared_bins
+ array([0., 1., 2., 3., 4., 5.])
+
+ >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])
+ >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins)
+ >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins)
+
+ >>> hist_0; hist_1
+ array([1, 1, 0, 1, 0])
+ array([2, 0, 1, 1, 2])
+
+ Which gives more easily comparable results than using separate bins for
+ each histogram:
+
+ >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')
+ >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')
+ >>> hist_0; hist_1
+ array([1, 1, 1])
+ array([2, 1, 1, 2])
+ >>> bins_0; bins_1
+ array([0., 1., 2., 3.])
+ array([0. , 1.25, 2.5 , 3.75, 5. ])
+
+ """
+ a, weights = _ravel_and_check_weights(a, weights)
+ bin_edges, _ = _get_bin_edges(a, bins, range, weights)
+ return bin_edges
+
+
def histogram(a, bins=10, range=None, normed=False, weights=None,
density=None):
r"""
@@ -448,7 +572,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
See Also
--------
- histogramdd, bincount, searchsorted, digitize
+ histogramdd, bincount, searchsorted, digitize, histogram_bin_edges
Notes
-----
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index a2c684a20..4f7af214c 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -2,7 +2,7 @@ from __future__ import division, absolute_import, print_function
import numpy as np
-from numpy.lib.histograms import histogram, histogramdd
+from numpy.lib.histograms import histogram, histogramdd, histogram_bin_edges
from numpy.testing import (
run_module_suite, assert_, assert_equal, assert_array_equal,
assert_almost_equal, assert_array_almost_equal, assert_raises,
@@ -346,6 +346,20 @@ class TestHistogram(object):
self.do_precision(np.single, np.longdouble)
self.do_precision(np.double, np.longdouble)
+ def test_histogram_bin_edges(self):
+ hist, e = histogram([1, 2, 3, 4], [1, 2])
+ edges = histogram_bin_edges([1, 2, 3, 4], [1, 2])
+ assert_array_equal(edges, e)
+
+ arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
+ hist, e = histogram(arr, bins=30, range=(-0.5, 5))
+ edges = histogram_bin_edges(arr, bins=30, range=(-0.5, 5))
+ assert_array_equal(edges, e)
+
+ hist, e = histogram(arr, bins='auto', range=(0, 1))
+ edges = histogram_bin_edges(arr, bins='auto', range=(0, 1))
+ assert_array_equal(edges, e)
+
class TestHistogramOptimBinNums(object):
"""