test_contingency.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. from __future__ import division, print_function, absolute_import
  2. import numpy as np
  3. from numpy.testing import (assert_equal, assert_array_equal,
  4. assert_array_almost_equal, assert_approx_equal, assert_allclose)
  5. from pytest import raises as assert_raises
  6. from scipy.special import xlogy
  7. from scipy.stats.contingency import margins, expected_freq, chi2_contingency
  8. def test_margins():
  9. a = np.array([1])
  10. m = margins(a)
  11. assert_equal(len(m), 1)
  12. m0 = m[0]
  13. assert_array_equal(m0, np.array([1]))
  14. a = np.array([[1]])
  15. m0, m1 = margins(a)
  16. expected0 = np.array([[1]])
  17. expected1 = np.array([[1]])
  18. assert_array_equal(m0, expected0)
  19. assert_array_equal(m1, expected1)
  20. a = np.arange(12).reshape(2, 6)
  21. m0, m1 = margins(a)
  22. expected0 = np.array([[15], [51]])
  23. expected1 = np.array([[6, 8, 10, 12, 14, 16]])
  24. assert_array_equal(m0, expected0)
  25. assert_array_equal(m1, expected1)
  26. a = np.arange(24).reshape(2, 3, 4)
  27. m0, m1, m2 = margins(a)
  28. expected0 = np.array([[[66]], [[210]]])
  29. expected1 = np.array([[[60], [92], [124]]])
  30. expected2 = np.array([[[60, 66, 72, 78]]])
  31. assert_array_equal(m0, expected0)
  32. assert_array_equal(m1, expected1)
  33. assert_array_equal(m2, expected2)
  34. def test_expected_freq():
  35. assert_array_equal(expected_freq([1]), np.array([1.0]))
  36. observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
  37. e = expected_freq(observed)
  38. assert_array_equal(e, np.ones_like(observed))
  39. observed = np.array([[10, 10, 20], [20, 20, 20]])
  40. e = expected_freq(observed)
  41. correct = np.array([[12., 12., 16.], [18., 18., 24.]])
  42. assert_array_almost_equal(e, correct)
  43. def test_chi2_contingency_trivial():
  44. # Some very simple tests for chi2_contingency.
  45. # A trivial case
  46. obs = np.array([[1, 2], [1, 2]])
  47. chi2, p, dof, expected = chi2_contingency(obs, correction=False)
  48. assert_equal(chi2, 0.0)
  49. assert_equal(p, 1.0)
  50. assert_equal(dof, 1)
  51. assert_array_equal(obs, expected)
  52. # A *really* trivial case: 1-D data.
  53. obs = np.array([1, 2, 3])
  54. chi2, p, dof, expected = chi2_contingency(obs, correction=False)
  55. assert_equal(chi2, 0.0)
  56. assert_equal(p, 1.0)
  57. assert_equal(dof, 0)
  58. assert_array_equal(obs, expected)
  59. def test_chi2_contingency_R():
  60. # Some test cases that were computed independently, using R.
  61. Rcode = \
  62. """
  63. # Data vector.
  64. data <- c(
  65. 12, 34, 23, 4, 47, 11,
  66. 35, 31, 11, 34, 10, 18,
  67. 12, 32, 9, 18, 13, 19,
  68. 12, 12, 14, 9, 33, 25
  69. )
  70. # Create factor tags:r=rows, c=columns, t=tiers
  71. r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
  72. c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
  73. t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
  74. # 3-way Chi squared test of independence
  75. s = summary(xtabs(data~r+c+t))
  76. print(s)
  77. """
  78. Routput = \
  79. """
  80. Call: xtabs(formula = data ~ r + c + t)
  81. Number of cases in table: 478
  82. Number of factors: 3
  83. Test for independence of all factors:
  84. Chisq = 102.17, df = 17, p-value = 3.514e-14
  85. """
  86. obs = np.array(
  87. [[[12, 34, 23],
  88. [35, 31, 11],
  89. [12, 32, 9],
  90. [12, 12, 14]],
  91. [[4, 47, 11],
  92. [34, 10, 18],
  93. [18, 13, 19],
  94. [9, 33, 25]]])
  95. chi2, p, dof, expected = chi2_contingency(obs)
  96. assert_approx_equal(chi2, 102.17, significant=5)
  97. assert_approx_equal(p, 3.514e-14, significant=4)
  98. assert_equal(dof, 17)
  99. Rcode = \
  100. """
  101. # Data vector.
  102. data <- c(
  103. #
  104. 12, 17,
  105. 11, 16,
  106. #
  107. 11, 12,
  108. 15, 16,
  109. #
  110. 23, 15,
  111. 30, 22,
  112. #
  113. 14, 17,
  114. 15, 16
  115. )
  116. # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
  117. r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
  118. c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
  119. d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
  120. t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
  121. # 4-way Chi squared test of independence
  122. s = summary(xtabs(data~r+c+d+t))
  123. print(s)
  124. """
  125. Routput = \
  126. """
  127. Call: xtabs(formula = data ~ r + c + d + t)
  128. Number of cases in table: 262
  129. Number of factors: 4
  130. Test for independence of all factors:
  131. Chisq = 8.758, df = 11, p-value = 0.6442
  132. """
  133. obs = np.array(
  134. [[[[12, 17],
  135. [11, 16]],
  136. [[11, 12],
  137. [15, 16]]],
  138. [[[23, 15],
  139. [30, 22]],
  140. [[14, 17],
  141. [15, 16]]]])
  142. chi2, p, dof, expected = chi2_contingency(obs)
  143. assert_approx_equal(chi2, 8.758, significant=4)
  144. assert_approx_equal(p, 0.6442, significant=4)
  145. assert_equal(dof, 11)
  146. def test_chi2_contingency_g():
  147. c = np.array([[15, 60], [15, 90]])
  148. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=False)
  149. assert_allclose(g, 2*xlogy(c, c/e).sum())
  150. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood', correction=True)
  151. c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
  152. assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
  153. c = np.array([[10, 12, 10], [12, 10, 10]])
  154. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
  155. assert_allclose(g, 2*xlogy(c, c/e).sum())
  156. def test_chi2_contingency_bad_args():
  157. # Test that "bad" inputs raise a ValueError.
  158. # Negative value in the array of observed frequencies.
  159. obs = np.array([[-1, 10], [1, 2]])
  160. assert_raises(ValueError, chi2_contingency, obs)
  161. # The zeros in this will result in zeros in the array
  162. # of expected frequencies.
  163. obs = np.array([[0, 1], [0, 1]])
  164. assert_raises(ValueError, chi2_contingency, obs)
  165. # A degenerate case: `observed` has size 0.
  166. obs = np.empty((0, 8))
  167. assert_raises(ValueError, chi2_contingency, obs)