markov.py

from __future__ import division
# use cPickle when using python2 for better performance
try:
    import cPickle as pickle
except ImportError:
    import pickle
import logging
import os
import random
import re
from collections import defaultdict
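

# pickle cannot serialize lambdas, so a database built as
# defaultdict(lambda: defaultdict(lambda: 1.0)) would make dumpdb() fail
# with a PicklingError. These module-level factories are a picklable
# equivalent (the names are ours, chosen for this fix).
def _one():
    return 1.0


def _oneDict():
    return defaultdict(_one)


def _dbFactory():
    return defaultdict(_oneDict)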


PUNCTUATION = re.compile(r"([\.,;!?])")


def tokenize(text):
    text = PUNCTUATION.sub(r" \1 ", text)
    return text.split()
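
# Example: tokenize("Hello, world!") returns ['Hello', ',', 'world', '!']
# (punctuation is padded with spaces, so it splits off as its own token).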


class StringContinuationImpossibleError(Exception):
    pass


def _wordIter(text, separator='.'):
    """
    An iterator over the 'words' in the given text, as defined by
    the regular expression given as separator.
    """
    exp = re.compile(separator)
    pos = 0
    for occ in exp.finditer(text):
        sub = text[pos:occ.start()].strip()
        if sub:
            yield sub
        pos = occ.start() + 1
    if pos < len(text):
        # take care of the last part
        sub = text[pos:].strip()
        if sub:
            yield sub
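
# Example: list(_wordIter("one. two. three")) yields ['one', 'two', 'three'];
# despite the name, it iterates over separator-delimited chunks ('sentences'),
# not individual words.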


class MarkovChain(object):
    def __init__(self, dbFilePath=None):
        self.dbFilePath = dbFilePath
        if not dbFilePath:
            self.dbFilePath = os.path.join(os.path.dirname(__file__), "markovdb")
        try:
            with open(self.dbFilePath, 'rb') as dbfile:
                self.db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warning('Database file corrupt or not found, using empty database')
            self.db = _dbFactory()

    def generateDatabase(self, textSample, sentenceSep='[.!?\n]', n=2):
        """ Generate word probability database from raw content string """
        # I'm using the database to temporarily store word counts
        textSample = _wordIter(textSample, sentenceSep)  # get an iterator for the 'sentences'
        # We're using '' as special symbol for the beginning
        # of a sentence
        self.db[('',)][''] = 0.0
        for line in textSample:
            words = line.strip().split()  # split words in line
            if len(words) == 0:
                continue
            # first word follows a sentence end
            self.db[("",)][words[0]] += 1
            for order in range(1, n + 1):
                for i in range(len(words) - 1):
                    if i + order >= len(words):
                        continue
                    word = tuple(words[i:i + order])
                    self.db[word][words[i + order]] += 1
                # last word precedes a sentence end
                self.db[tuple(words[len(words) - order:len(words)])][""] += 1
        # We've now got the db filled with parametrized word counts
        # We still need to normalize this to represent probabilities
        for word in self.db:
            wordsum = 0
            for nextword in self.db[word]:
                wordsum += self.db[word][nextword]
            if wordsum != 0:
                for nextword in self.db[word]:
                    self.db[word][nextword] /= wordsum
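
    # Example (with the defaults above): given a fresh instance mc with an
    # empty database, after
    #     mc.generateDatabase("a b. a c.")
    # the normalized table contains, among other entries,
    #     mc.db[('a',)] == {'b': 0.5, 'c': 0.5}
    # i.e. 'a' is followed by 'b' or 'c' with equal probability.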

    def dumpdb(self):
        try:
            with open(self.dbFilePath, 'wb') as dbfile:
                pickle.dump(self.db, dbfile)
            # It looks like db was written successfully
            return True
        except IOError:
            logging.warning('Database file could not be written')
            return False

    def generateString(self):
        """ Generate a "sentence" with the database of known text """
        return self._accumulateWithSeed(('',))

    def generateStringWithSeed(self, seed):
        """ Generate a "sentence" with the database and a given word """
        # using str.split here means we're constructing the list in memory,
        # but as the generated sentence only depends on the last word of the
        # seed, I'm assuming seeds tend to be rather short.
        words = seed.split()
        if (words[-1],) not in self.db:
            # The only possible way it won't work is if the last word is not known
            raise StringContinuationImpossibleError('Could not continue string: '
                                                    + seed)
        return self._accumulateWithSeed(words)
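
    # Example: mc.generateStringWithSeed("where the") returns a sentence
    # beginning with "where the", or raises StringContinuationImpossibleError
    # if 'the' never occurred in the training text.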

    def _accumulateWithSeed(self, seed):
        """ Accumulate the generated sentence, starting from the given seed
        sequence of words """
        nextWord = self._nextWord(seed)
        sentence = list(seed) if seed else []
        while nextWord:
            sentence.append(nextWord)
            nextWord = self._nextWord(sentence)
        return ' '.join(sentence).strip()

    def _nextWord(self, lastwords):
        lastwords = tuple(lastwords)
        if lastwords != ('',):
            while lastwords not in self.db:
                lastwords = lastwords[1:]
                if not lastwords:
                    return ''
        probmap = self.db[lastwords]
        sample = random.random()
        # since rounding errors might make us miss out on some words
        maxprob = 0.0
        maxprobword = ""
        for candidate in probmap:
            # remember which word had the highest probability;
            # this is the word we'll default to if we can't find anything else
            if probmap[candidate] > maxprob:
                maxprob = probmap[candidate]
                maxprobword = candidate
            if sample > probmap[candidate]:
                sample -= probmap[candidate]
            else:
                return candidate
        # getting here means we haven't found a matching word. :(
        return maxprobword


# pylama:ignore=D
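

if __name__ == '__main__':
    # Minimal usage sketch: train on a couple of made-up sentences, persist
    # the database, and generate text. The database path is illustrative.
    mc = MarkovChain('/tmp/markovdb')
    mc.generateDatabase('The quick brown fox jumps over the lazy dog. '
                        'The lazy dog sleeps. The quick fox runs.')
    mc.dumpdb()
    print(mc.generateString())
    print(mc.generateStringWithSeed('The quick'))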