# distribution.py
# coding=utf-8
import bisect
from itertools import accumulate

from faker.generator import random as mod_random
  4. def random_sample(random=None):
  5. if random is None:
  6. random = mod_random
  7. return random.uniform(0.0, 1.0)
  8. def cumsum(it):
  9. total = 0
  10. for x in it:
  11. total += x
  12. yield total
  13. def choices_distribution_unique(a, p, random=None, length=1):
  14. # As of Python 3.7, there isn't a way to sample unique elements that takes
  15. # weight into account.
  16. if random is None:
  17. random = mod_random
  18. assert len(a) == len(p)
  19. assert len(a) >= length, "You can't request more unique samples than elements in the dataset."
  20. choices = []
  21. items = list(a)
  22. probabilities = list(p)
  23. for i in range(length):
  24. cdf = list(cumsum(probabilities))
  25. normal = cdf[-1]
  26. cdf2 = [float(i) / float(normal) for i in cdf]
  27. uniform_sample = random_sample(random=random)
  28. idx = bisect.bisect_right(cdf2, uniform_sample)
  29. item = items[idx]
  30. choices.append(item)
  31. probabilities.pop(idx)
  32. items.pop(idx)
  33. return choices
  34. def choices_distribution(a, p, random=None, length=1):
  35. if random is None:
  36. random = mod_random
  37. assert len(a) == len(p)
  38. if hasattr(random, 'choices'):
  39. choices = random.choices(a, weights=p, k=length)
  40. return choices
  41. else:
  42. choices = []
  43. cdf = list(cumsum(p))
  44. normal = cdf[-1]
  45. cdf2 = [float(i) / float(normal) for i in cdf]
  46. for i in range(length):
  47. uniform_sample = random_sample(random=random)
  48. idx = bisect.bisect_right(cdf2, uniform_sample)
  49. item = a[idx]
  50. choices.append(item)
  51. return choices