contingency.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. """Some functions for working with contingency tables (i.e. cross tabulations).
  2. """
  3. from __future__ import division, print_function, absolute_import
  4. from functools import reduce
  5. import numpy as np
  6. from .stats import power_divergence
  7. __all__ = ['margins', 'expected_freq', 'chi2_contingency']
  8. def margins(a):
  9. """Return a list of the marginal sums of the array `a`.
  10. Parameters
  11. ----------
  12. a : ndarray
  13. The array for which to compute the marginal sums.
  14. Returns
  15. -------
  16. margsums : list of ndarrays
  17. A list of length `a.ndim`. `margsums[k]` is the result
  18. of summing `a` over all axes except `k`; it has the same
  19. number of dimensions as `a`, but the length of each axis
  20. except axis `k` will be 1.
  21. Examples
  22. --------
  23. >>> a = np.arange(12).reshape(2, 6)
  24. >>> a
  25. array([[ 0, 1, 2, 3, 4, 5],
  26. [ 6, 7, 8, 9, 10, 11]])
  27. >>> m0, m1 = margins(a)
  28. >>> m0
  29. array([[15],
  30. [51]])
  31. >>> m1
  32. array([[ 6, 8, 10, 12, 14, 16]])
  33. >>> b = np.arange(24).reshape(2,3,4)
  34. >>> m0, m1, m2 = margins(b)
  35. >>> m0
  36. array([[[ 66]],
  37. [[210]]])
  38. >>> m1
  39. array([[[ 60],
  40. [ 92],
  41. [124]]])
  42. >>> m2
  43. array([[[60, 66, 72, 78]]])
  44. """
  45. margsums = []
  46. ranged = list(range(a.ndim))
  47. for k in ranged:
  48. marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
  49. margsums.append(marg)
  50. return margsums
  51. def expected_freq(observed):
  52. """
  53. Compute the expected frequencies from a contingency table.
  54. Given an n-dimensional contingency table of observed frequencies,
  55. compute the expected frequencies for the table based on the marginal
  56. sums under the assumption that the groups associated with each
  57. dimension are independent.
  58. Parameters
  59. ----------
  60. observed : array_like
  61. The table of observed frequencies. (While this function can handle
  62. a 1-D array, that case is trivial. Generally `observed` is at
  63. least 2-D.)
  64. Returns
  65. -------
  66. expected : ndarray of float64
  67. The expected frequencies, based on the marginal sums of the table.
  68. Same shape as `observed`.
  69. Examples
  70. --------
  71. >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
  72. >>> from scipy.stats import expected_freq
  73. >>> expected_freq(observed)
  74. array([[ 12., 12., 16.],
  75. [ 18., 18., 24.]])
  76. """
  77. # Typically `observed` is an integer array. If `observed` has a large
  78. # number of dimensions or holds large values, some of the following
  79. # computations may overflow, so we first switch to floating point.
  80. observed = np.asarray(observed, dtype=np.float64)
  81. # Create a list of the marginal sums.
  82. margsums = margins(observed)
  83. # Create the array of expected frequencies. The shapes of the
  84. # marginal sums returned by apply_over_axes() are just what we
  85. # need for broadcasting in the following product.
  86. d = observed.ndim
  87. expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
  88. return expected
  89. def chi2_contingency(observed, correction=True, lambda_=None):
  90. """Chi-square test of independence of variables in a contingency table.
  91. This function computes the chi-square statistic and p-value for the
  92. hypothesis test of independence of the observed frequencies in the
  93. contingency table [1]_ `observed`. The expected frequencies are computed
  94. based on the marginal sums under the assumption of independence; see
  95. `scipy.stats.contingency.expected_freq`. The number of degrees of
  96. freedom is (expressed using numpy functions and attributes)::
  97. dof = observed.size - sum(observed.shape) + observed.ndim - 1
  98. Parameters
  99. ----------
  100. observed : array_like
  101. The contingency table. The table contains the observed frequencies
  102. (i.e. number of occurrences) in each category. In the two-dimensional
  103. case, the table is often described as an "R x C table".
  104. correction : bool, optional
  105. If True, *and* the degrees of freedom is 1, apply Yates' correction
  106. for continuity. The effect of the correction is to adjust each
  107. observed value by 0.5 towards the corresponding expected value.
  108. lambda_ : float or str, optional.
  109. By default, the statistic computed in this test is Pearson's
  110. chi-squared statistic [2]_. `lambda_` allows a statistic from the
  111. Cressie-Read power divergence family [3]_ to be used instead. See
  112. `power_divergence` for details.
  113. Returns
  114. -------
  115. chi2 : float
  116. The test statistic.
  117. p : float
  118. The p-value of the test
  119. dof : int
  120. Degrees of freedom
  121. expected : ndarray, same shape as `observed`
  122. The expected frequencies, based on the marginal sums of the table.
  123. See Also
  124. --------
  125. contingency.expected_freq
  126. fisher_exact
  127. chisquare
  128. power_divergence
  129. Notes
  130. -----
  131. An often quoted guideline for the validity of this calculation is that
  132. the test should be used only if the observed and expected frequencies
  133. in each cell are at least 5.
  134. This is a test for the independence of different categories of a
  135. population. The test is only meaningful when the dimension of
  136. `observed` is two or more. Applying the test to a one-dimensional
  137. table will always result in `expected` equal to `observed` and a
  138. chi-square statistic equal to 0.
  139. This function does not handle masked arrays, because the calculation
  140. does not make sense with missing values.
  141. Like stats.chisquare, this function computes a chi-square statistic;
  142. the convenience this function provides is to figure out the expected
  143. frequencies and degrees of freedom from the given contingency table.
  144. If these were already known, and if the Yates' correction was not
  145. required, one could use stats.chisquare. That is, if one calls::
  146. chi2, p, dof, ex = chi2_contingency(obs, correction=False)
  147. then the following is true::
  148. (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
  149. ddof=obs.size - 1 - dof)
  150. The `lambda_` argument was added in version 0.13.0 of scipy.
  151. References
  152. ----------
  153. .. [1] "Contingency table",
  154. https://en.wikipedia.org/wiki/Contingency_table
  155. .. [2] "Pearson's chi-squared test",
  156. https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
  157. .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
  158. Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
  159. pp. 440-464.
  160. Examples
  161. --------
  162. A two-way example (2 x 3):
  163. >>> from scipy.stats import chi2_contingency
  164. >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
  165. >>> chi2_contingency(obs)
  166. (2.7777777777777777,
  167. 0.24935220877729619,
  168. 2,
  169. array([[ 12., 12., 16.],
  170. [ 18., 18., 24.]]))
  171. Perform the test using the log-likelihood ratio (i.e. the "G-test")
  172. instead of Pearson's chi-squared statistic.
  173. >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
  174. >>> g, p
  175. (2.7688587616781319, 0.25046668010954165)
  176. A four-way example (2 x 2 x 2 x 2):
  177. >>> obs = np.array(
  178. ... [[[[12, 17],
  179. ... [11, 16]],
  180. ... [[11, 12],
  181. ... [15, 16]]],
  182. ... [[[23, 15],
  183. ... [30, 22]],
  184. ... [[14, 17],
  185. ... [15, 16]]]])
  186. >>> chi2_contingency(obs)
  187. (8.7584514426741897,
  188. 0.64417725029295503,
  189. 11,
  190. array([[[[ 14.15462386, 14.15462386],
  191. [ 16.49423111, 16.49423111]],
  192. [[ 11.2461395 , 11.2461395 ],
  193. [ 13.10500554, 13.10500554]]],
  194. [[[ 19.5591166 , 19.5591166 ],
  195. [ 22.79202844, 22.79202844]],
  196. [[ 15.54012004, 15.54012004],
  197. [ 18.10873492, 18.10873492]]]]))
  198. """
  199. observed = np.asarray(observed)
  200. if np.any(observed < 0):
  201. raise ValueError("All values in `observed` must be nonnegative.")
  202. if observed.size == 0:
  203. raise ValueError("No data; `observed` has size 0.")
  204. expected = expected_freq(observed)
  205. if np.any(expected == 0):
  206. # Include one of the positions where expected is zero in
  207. # the exception message.
  208. zeropos = list(zip(*np.nonzero(expected == 0)))[0]
  209. raise ValueError("The internally computed table of expected "
  210. "frequencies has a zero element at %s." % (zeropos,))
  211. # The degrees of freedom
  212. dof = expected.size - sum(expected.shape) + expected.ndim - 1
  213. if dof == 0:
  214. # Degenerate case; this occurs when `observed` is 1D (or, more
  215. # generally, when it has only one nontrivial dimension). In this
  216. # case, we also have observed == expected, so chi2 is 0.
  217. chi2 = 0.0
  218. p = 1.0
  219. else:
  220. if dof == 1 and correction:
  221. # Adjust `observed` according to Yates' correction for continuity.
  222. observed = observed + 0.5 * np.sign(expected - observed)
  223. chi2, p = power_divergence(observed, expected,
  224. ddof=observed.size - 1 - dof, axis=None,
  225. lambda_=lambda_)
  226. return chi2, p, dof, expected