# test_value_counts.py (2.3 KB)
"""
Systematically test every argument of value_counts across different
size combinations. This ensures that the sorting is stable and that the
parameters are handled properly.
"""
  6. from itertools import product
  7. import numpy as np
  8. import pytest
  9. from pandas import DataFrame, MultiIndex, Series, date_range
  10. from pandas.util import testing as tm
  11. # our starting frame
  12. def seed_df(seed_nans, n, m):
  13. np.random.seed(1234)
  14. days = date_range('2015-08-24', periods=10)
  15. frame = DataFrame({
  16. '1st': np.random.choice(
  17. list('abcd'), n),
  18. '2nd': np.random.choice(days, n),
  19. '3rd': np.random.randint(1, m + 1, n)
  20. })
  21. if seed_nans:
  22. frame.loc[1::11, '1st'] = np.nan
  23. frame.loc[3::17, '2nd'] = np.nan
  24. frame.loc[7::19, '3rd'] = np.nan
  25. frame.loc[8::19, '3rd'] = np.nan
  26. frame.loc[9::19, '3rd'] = np.nan
  27. return frame
  28. # create input df, keys, and the bins
  29. binned = []
  30. ids = []
  31. for seed_nans in [True, False]:
  32. for n, m in product((100, 1000), (5, 20)):
  33. df = seed_df(seed_nans, n, m)
  34. bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
  35. keys = '1st', '2nd', ['1st', '2nd']
  36. for k, b in product(keys, bins):
  37. binned.append((df, k, b, n, m))
  38. ids.append("{}-{}-{}".format(k, n, m))
  39. @pytest.mark.slow
  40. @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
  41. def test_series_groupby_value_counts(df, keys, bins, n, m):
  42. def rebuild_index(df):
  43. arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
  44. df.index = MultiIndex.from_arrays(arr, names=df.index.names)
  45. return df
  46. for isort, normalize, sort, ascending, dropna \
  47. in product((False, True), repeat=5):
  48. kwargs = dict(normalize=normalize, sort=sort,
  49. ascending=ascending, dropna=dropna, bins=bins)
  50. gr = df.groupby(keys, sort=isort)
  51. left = gr['3rd'].value_counts(**kwargs)
  52. gr = df.groupby(keys, sort=isort)
  53. right = gr['3rd'].apply(Series.value_counts, **kwargs)
  54. right.index.names = right.index.names[:-1] + ['3rd']
  55. # have to sort on index because of unstable sort on values
  56. left, right = map(rebuild_index, (left, right)) # xref GH9212
  57. tm.assert_series_equal(left.sort_index(), right.sort_index())