summaryrefslogtreecommitdiff
path: root/libs/math/doc/sf_and_dist/html/math_toolkit/dist/dist_ref/dists/hypergeometric_dist.html
blob: 5ab8ced41f1a361b0ae32a13937e85d3da470fbb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=US-ASCII">
<title>Hypergeometric Distribution</title>
<link rel="stylesheet" href="../../../../../../../../../doc/src/boostbook.css" type="text/css">
<meta name="generator" content="DocBook XSL Stylesheets V1.76.1">
<link rel="home" href="../../../../index.html" title="Math Toolkit">
<link rel="up" href="../dists.html" title="Distributions">
<link rel="prev" href="geometric_dist.html" title="Geometric Distribution">
<link rel="next" href="inverse_chi_squared_dist.html" title="Inverse Chi Squared Distribution">
</head>
<body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF">
<table cellpadding="2" width="100%"><tr>
<td valign="top"><img alt="Boost C++ Libraries" width="277" height="86" src="../../../../../../../../../boost.png"></td>
<td align="center"><a href="../../../../../../../../../index.html">Home</a></td>
<td align="center"><a href="../../../../../../../../../libs/libraries.htm">Libraries</a></td>
<td align="center"><a href="http://www.boost.org/users/people.html">People</a></td>
<td align="center"><a href="http://www.boost.org/users/faq.html">FAQ</a></td>
<td align="center"><a href="../../../../../../../../../more/index.htm">More</a></td>
</tr></table>
<hr>
<div class="spirit-nav">
<a accesskey="p" href="geometric_dist.html"><img src="../../../../../../../../../doc/src/images/prev.png" alt="Prev"></a><a accesskey="u" href="../dists.html"><img src="../../../../../../../../../doc/src/images/up.png" alt="Up"></a><a accesskey="h" href="../../../../index.html"><img src="../../../../../../../../../doc/src/images/home.png" alt="Home"></a><a accesskey="n" href="inverse_chi_squared_dist.html"><img src="../../../../../../../../../doc/src/images/next.png" alt="Next"></a>
</div>
<div class="section math_toolkit_dist_dist_ref_dists_hypergeometric_dist">
<div class="titlepage"><div><div><h5 class="title">
<a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist"></a><a class="link" href="hypergeometric_dist.html" title="Hypergeometric Distribution">Hypergeometric
          Distribution</a>
</h5></div></div></div>
<p>
</p>
<pre class="programlisting"><span class="preprocessor">#include</span> <span class="special">&lt;</span><span class="identifier">boost</span><span class="special">/</span><span class="identifier">math</span><span class="special">/</span><span class="identifier">distributions</span><span class="special">/</span><span class="identifier">hypergeometric</span><span class="special">.</span><span class="identifier">hpp</span><span class="special">&gt;</span></pre>
<p>
          </p>
<pre class="programlisting"><span class="keyword">namespace</span> <span class="identifier">boost</span><span class="special">{</span> <span class="keyword">namespace</span> <span class="identifier">math</span><span class="special">{</span>

<span class="keyword">template</span> <span class="special">&lt;</span><span class="keyword">class</span> <span class="identifier">RealType</span> <span class="special">=</span> <span class="keyword">double</span><span class="special">,</span>
          <span class="keyword">class</span> <a class="link" href="../../../policy.html" title="Policies">Policy</a>   <span class="special">=</span> <a class="link" href="../../../policy/pol_ref/pol_ref_ref.html" title="Policy Class Reference">policies::policy&lt;&gt;</a> <span class="special">&gt;</span>
<span class="keyword">class</span> <span class="identifier">hypergeometric_distribution</span><span class="special">;</span>

<span class="keyword">template</span> <span class="special">&lt;</span><span class="keyword">class</span> <span class="identifier">RealType</span><span class="special">,</span> <span class="keyword">class</span> <span class="identifier">Policy</span><span class="special">&gt;</span>
<span class="keyword">class</span> <span class="identifier">hypergeometric_distribution</span>
<span class="special">{</span>
<span class="keyword">public</span><span class="special">:</span>
   <span class="keyword">typedef</span> <span class="identifier">RealType</span> <span class="identifier">value_type</span><span class="special">;</span>
   <span class="keyword">typedef</span> <span class="identifier">Policy</span>   <span class="identifier">policy_type</span><span class="special">;</span>
   <span class="comment">// Construct:</span>
   <span class="identifier">hypergeometric_distribution</span><span class="special">(</span><span class="keyword">unsigned</span> <span class="identifier">r</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">n</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">N</span><span class="special">);</span>
   <span class="comment">// Accessors:</span>
   <span class="keyword">unsigned</span> <span class="identifier">total</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
   <span class="keyword">unsigned</span> <span class="identifier">defective</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
   <span class="keyword">unsigned</span> <span class="identifier">sample_count</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
<span class="special">};</span>

<span class="keyword">typedef</span> <span class="identifier">hypergeometric_distribution</span><span class="special">&lt;&gt;</span> <span class="identifier">hypergeometric</span><span class="special">;</span>

<span class="special">}}</span> <span class="comment">// namespaces</span>
</pre>
<p>
            The hypergeometric distribution describes the number of "events"
            <span class="emphasis"><em>k</em></span> from a sample <span class="emphasis"><em>n</em></span> drawn from
            a total population <span class="emphasis"><em>N</em></span> <span class="emphasis"><em>without replacement</em></span>.
          </p>
<p>
            Imagine we have a sample of <span class="emphasis"><em>N</em></span> objects of which
            <span class="emphasis"><em>r</em></span> are "defective" and N-r are "not
            defective" (the terms "success/failure" or "red/blue"
            are also used). If we sample <span class="emphasis"><em>n</em></span> items <span class="emphasis"><em>without
            replacement</em></span> then what is the probability that exactly <span class="emphasis"><em>k</em></span>
            items in the sample are defective? The answer is given by the pdf of
            the hypergeometric distribution <code class="computeroutput"><span class="identifier">f</span><span class="special">(</span><span class="identifier">k</span><span class="special">;</span> <span class="identifier">r</span><span class="special">,</span> <span class="identifier">n</span><span class="special">,</span> <span class="identifier">N</span><span class="special">)</span></code>, whilst the probability of <span class="emphasis"><em>k</em></span>
            defectives or fewer is given by F(k; r, n, N), where F(k) is the CDF
            of the hypergeometric distribution.
          </p>
<div class="note"><table border="0" summary="Note">
<tr>
<td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="../../../../../../../../../doc/src/images/note.png"></td>
<th align="left">Note</th>
</tr>
<tr><td align="left" valign="top"><p>
              Unlike almost all of the other distributions in this library, the hypergeometric
              distribution is strictly discrete: it can not be extended to real valued
              arguments of its parameters or random variable.
            </p></td></tr>
</table></div>
<p>
            The following graph shows how the distribution changes as the proportion
            of "defective" items changes, while keeping the population
            and sample sizes constant:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../graphs/hypergeometric_pdf_1.png" align="middle"></span>
          </p>
<p>
            Note that since the distribution is symmetrical in parameters <span class="emphasis"><em>n</em></span>
            and <span class="emphasis"><em>r</em></span>, if we change the sample size and keep the
            population and proportion "defective" the same then we obtain
            basically the same graphs:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../graphs/hypergeometric_pdf_2.png" align="middle"></span>
          </p>
<h5>
<a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.h0"></a>
            <span><a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.member_functions"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist.dist_ref.dists.hypergeometric_dist.member_functions">Member
            Functions</a>
          </h5>
<pre class="programlisting"><span class="identifier">hypergeometric_distribution</span><span class="special">(</span><span class="keyword">unsigned</span> <span class="identifier">r</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">n</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">N</span><span class="special">);</span>
</pre>
<p>
            Constructs a hypergeometric distribution with with a population of <span class="emphasis"><em>N</em></span>
            objects, of which <span class="emphasis"><em>r</em></span> are defective, and from which
            <span class="emphasis"><em>n</em></span> are sampled.
          </p>
<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">total</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
</pre>
<p>
            Returns the total number of objects <span class="emphasis"><em>N</em></span>.
          </p>
<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">defective</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
</pre>
<p>
            Returns the number of objects <span class="emphasis"><em>r</em></span> in population <span class="emphasis"><em>N</em></span>
            which are defective.
          </p>
<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">sample_count</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
</pre>
<p>
            Returns the number of objects <span class="emphasis"><em>n</em></span> which are sampled
            from the population <span class="emphasis"><em>N</em></span>.
          </p>
<h5>
<a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.h1"></a>
            <span><a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.non_member_accessors"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist.dist_ref.dists.hypergeometric_dist.non_member_accessors">Non-member
            Accessors</a>
          </h5>
<p>
            All the <a class="link" href="../nmp.html" title="Non-Member Properties">usual non-member
            accessor functions</a> that are generic to all distributions are supported:
            <a class="link" href="../nmp.html#math.dist.cdf">Cumulative Distribution Function</a>,
            <a class="link" href="../nmp.html#math.dist.pdf">Probability Density Function</a>, <a class="link" href="../nmp.html#math.dist.quantile">Quantile</a>, <a class="link" href="../nmp.html#math.dist.hazard">Hazard
            Function</a>, <a class="link" href="../nmp.html#math.dist.chf">Cumulative Hazard Function</a>,
            <a class="link" href="../nmp.html#math.dist.mean">mean</a>, <a class="link" href="../nmp.html#math.dist.median">median</a>,
            <a class="link" href="../nmp.html#math.dist.mode">mode</a>, <a class="link" href="../nmp.html#math.dist.variance">variance</a>,
            <a class="link" href="../nmp.html#math.dist.sd">standard deviation</a>, <a class="link" href="../nmp.html#math.dist.skewness">skewness</a>,
            <a class="link" href="../nmp.html#math.dist.kurtosis">kurtosis</a>, <a class="link" href="../nmp.html#math.dist.kurtosis_excess">kurtosis_excess</a>,
            <a class="link" href="../nmp.html#math.dist.range">range</a> and <a class="link" href="../nmp.html#math.dist.support">support</a>.
          </p>
<p>
            The domain of the random variable is the unsigned integers in the range
            [max(0, n + r - N), min(n, r)]. A <a class="link" href="../../../main_overview/error_handling.html#domain_error">domain_error</a>
            is raised if the random variable is outside this range, or is not an
            integral value.
          </p>
<div class="caution"><table border="0" summary="Caution">
<tr>
<td rowspan="2" align="center" valign="top" width="25"><img alt="[Caution]" src="../../../../../../../../../doc/src/images/caution.png"></td>
<th align="left">Caution</th>
</tr>
<tr><td align="left" valign="top">
<p>
              The quantile function will by default return an integer result that
              has been <span class="emphasis"><em>rounded outwards</em></span>. That is to say lower
              quantiles (where the probability is less than 0.5) are rounded downward,
              and upper quantiles (where the probability is greater than 0.5) are
              rounded upwards. This behaviour ensures that if an X% quantile is requested,
              then <span class="emphasis"><em>at least</em></span> the requested coverage will be present
              in the central region, and <span class="emphasis"><em>no more than</em></span> the requested
              coverage will be present in the tails.
            </p>
<p>
              This behaviour can be changed so that the quantile functions are rounded
              differently using <a class="link" href="../../../policy/pol_overview.html" title="Policy Overview">Policies</a>.
              It is strongly recommended that you read the tutorial <a class="link" href="../../../policy/pol_tutorial/understand_dis_quant.html" title="Understanding Quantiles of Discrete Distributions">Understanding
              Quantiles of Discrete Distributions</a> before using the quantile
              function on the Hypergeometric distribution. The <a class="link" href="../../../policy/pol_ref/discrete_quant_ref.html" title="Discrete Quantile Policies">reference
              docs</a> describe how to change the rounding policy for these distributions.
            </p>
<p>
              However, note that the implementation method of the quantile function
              always returns an integral value, therefore attempting to use a <a class="link" href="../../../policy.html" title="Policies">Policy</a> that requires (or produces)
              a real valued result will result in a compile time error.
            </p>
</td></tr>
</table></div>
<h5>
<a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.h2"></a>
            <span><a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.accuracy"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist.dist_ref.dists.hypergeometric_dist.accuracy">Accuracy</a>
          </h5>
<p>
            For small N such that <code class="computeroutput"><span class="identifier">N</span> <span class="special">&lt;</span> <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code>
            then table based lookup of the results gives an accuracy to a few epsilon.
            <code class="computeroutput"><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code> is 170 at double or long double
            precision.
          </p>
<p>
            For larger N such that <code class="computeroutput"><span class="identifier">N</span> <span class="special">&lt;</span> <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">prime</span><span class="special">(</span><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_prime</span><span class="special">)</span></code> then only basic arithmetic is required
            for the calculation and the accuracy is typically &lt; 20 epsilon. This
            takes care of N up to 104729.
          </p>
<p>
            For <code class="computeroutput"><span class="identifier">N</span> <span class="special">&gt;</span>
            <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">prime</span><span class="special">(</span><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_prime</span><span class="special">)</span></code>
            then accuracy quickly degrades, with 5 or 6 decimal digits being lost
            for N = 110000.
          </p>
<p>
            In general for very large N, the user should expect to loose log<sub>10</sub>N decimal
            digits of precision during the calculation, with the results becoming
            meaningless for N &gt;= 10<sup>15</sup>.
          </p>
<h5>
<a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.h3"></a>
            <span><a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.testing"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist.dist_ref.dists.hypergeometric_dist.testing">Testing</a>
          </h5>
<p>
            There are three sets of tests: our implementation is tested against a
            table of values produced by Mathematica's implementation of this distribution.
            We also sanity check our implementation against some spot values computed
            using the online calculator here <a href="http://stattrek.com/Tables/Hypergeometric.aspx" target="_top">http://stattrek.com/Tables/Hypergeometric.aspx</a>.
            Finally we test accuracy against some high precision test data using
            this implementation and NTL::RR.
          </p>
<h5>
<a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.h4"></a>
            <span><a name="math_toolkit.dist.dist_ref.dists.hypergeometric_dist.implementation"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist.dist_ref.dists.hypergeometric_dist.implementation">Implementation</a>
          </h5>
<p>
            The PDF can be calculated directly using the formula:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../equations/hypergeometric1.png"></span>
          </p>
<p>
            However, this can only be used directly when the largest of the factorials
            is guaranteed not to overflow the floating point representation used.
            This formula is used directly when <code class="computeroutput"><span class="identifier">N</span>
            <span class="special">&lt;</span> <span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code>
            in which case table lookup of the factorials gives a rapid and accurate
            implementation method.
          </p>
<p>
            For larger <span class="emphasis"><em>N</em></span> the method described in "An Accurate
            Computation of the Hypergeometric Distribution Function", Trong
            Wu, ACM Transactions on Mathematical Software, Vol. 19, No. 1, March
            1993, Pages 33-43 is used. The method relies on the fact that there is
            an easy method for factorising a factorial into the product of prime
            numbers:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../equations/hypergeometric2.png"></span>
          </p>
<p>
            Where p<sub>i</sub> is the i'th prime number, and e<sub>i</sub> is a small positive integer or
            zero, which can be calculated via:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../equations/hypergeometric3.png"></span>
          </p>
<p>
            Further we can combine the factorials in the expression for the PDF to
            yield the PDF directly as the product of prime numbers:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../equations/hypergeometric4.png"></span>
          </p>
<p>
            With this time the exponents e<sub>i</sub> being either positive, negative or zero.
            Indeed such a degree of cancellation occurs in the calculation of the
            e<sub>i</sub> that many are zero, and typically most have a magnitude or no more
            than 1 or 2.
          </p>
<p>
            Calculation of the product of the primes requires some care to prevent
            numerical overflow, we use a novel recursive method which splits the
            calculation into a series of sub-products, with a new sub-product started
            each time the next multiplication would cause either overflow or underflow.
            The sub-products are stored in a linked list on the program stack, and
            combined in an order that will guarantee no overflow or unnecessary-underflow
            once the last sub-product has been calculated.
          </p>
<p>
            This method can be used as long as N is smaller than the largest prime
            number we have stored in our table of primes (currently 104729). The
            method is relatively slow (calculating the exponents requires the most
            time), but requires only a small number of arithmetic operations to calculate
            the result (indeed there is no shorter method involving only basic arithmetic
            once the exponents have been found), the method is therefore much more
            accurate than the alternatives.
          </p>
<p>
            For much larger N, we can calculate the PDF from the factorials using
            either lgamma, or by directly combining lanczos approximations to avoid
            calculating via logarithms. We use the latter method, as it is usually
            1 or 2 decimal digits more accurate than computing via logarithms with
            lgamma. However, in this area where N &gt; 104729, the user should expect
            to loose around log<sub>10</sub>N decimal digits during the calculation in the worst
            case.
          </p>
<p>
            The CDF and its complement is calculated by directly summing the PDF's.
            We start by deciding whether the CDF, or its complement, is likely to
            be the smaller of the two and then calculate the PDF at <span class="emphasis"><em>k</em></span>
            (or <span class="emphasis"><em>k+1</em></span> if we're calculating the complement) and
            calculate successive PDF values via the recurrence relations:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../equations/hypergeometric5.png"></span>
          </p>
<p>
            Until we either reach the end of the distributions domain, or the next
            PDF value to be summed would be too small to affect the result.
          </p>
<p>
            The quantile is calculated in a similar manner to the CDF: we first guess
            which end of the distribution we're nearer to, and then sum PDFs starting
            from the end of the distribution this time, until we have some value
            <span class="emphasis"><em>k</em></span> that gives the required CDF.
          </p>
<p>
            The median is simply the quantile at 0.5, and the remaining properties
            are calculated via:
          </p>
<p>
            <span class="inlinemediaobject"><img src="../../../../../equations/hypergeometric6.png"></span>
          </p>
</div>
<table xmlns:rev="http://www.cs.rpi.edu/~gregod/boost/tools/doc/revision" width="100%"><tr>
<td align="left"></td>
<td align="right"><div class="copyright-footer">Copyright &#169; 2006-2010 John Maddock, Paul A. Bristow, Hubert Holin, Xiaogang Zhang, Bruno
      Lalande, Johan R&#229;de, Gautam Sewani, Thijs van den Berg and Benjamin Sobotta<p>
        Distributed under the Boost Software License, Version 1.0. (See accompanying
        file LICENSE_1_0.txt or copy at <a href="http://www.boost.org/LICENSE_1_0.txt" target="_top">http://www.boost.org/LICENSE_1_0.txt</a>)
      </p>
</div></td>
</tr></table>
<hr>
<div class="spirit-nav">
<a accesskey="p" href="geometric_dist.html"><img src="../../../../../../../../../doc/src/images/prev.png" alt="Prev"></a><a accesskey="u" href="../dists.html"><img src="../../../../../../../../../doc/src/images/up.png" alt="Up"></a><a accesskey="h" href="../../../../index.html"><img src="../../../../../../../../../doc/src/images/home.png" alt="Home"></a><a accesskey="n" href="inverse_chi_squared_dist.html"><img src="../../../../../../../../../doc/src/images/next.png" alt="Next"></a>
</div>
</body>
</html>