# wordnet.py
  1. # Copyright 2009 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
"""This module contains low-level functions and a high-level class for parsing
the prolog file "wn_s.pl" from the WordNet prolog download
into an object suitable for looking up synonyms and performing query expansion.

http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
"""
  32. from collections import defaultdict
  33. from whoosh.compat import iterkeys, text_type
  34. from whoosh.fields import Schema, ID, STORED
  35. def parse_file(f):
  36. """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
  37. word2nums and num2words.
  38. """
  39. word2nums = defaultdict(list)
  40. num2words = defaultdict(list)
  41. for line in f:
  42. if not line.startswith("s("):
  43. continue
  44. line = line[2:]
  45. num = int(line[:line.find(",")])
  46. qt = line.find("'")
  47. line = line[qt + 1:]
  48. qt = line.find("'")
  49. word = line[:qt].lower()
  50. if not word.isalpha():
  51. continue
  52. word2nums[word].append(num)
  53. num2words[num].append(word)
  54. return word2nums, num2words
  55. def make_index(storage, indexname, word2nums, num2words):
  56. """Creates a Whoosh index in the given storage object containing
  57. synonyms taken from word2nums and num2words. Returns the Index
  58. object.
  59. """
  60. schema = Schema(word=ID, syns=STORED)
  61. ix = storage.create_index(schema, indexname=indexname)
  62. w = ix.writer()
  63. for word in iterkeys(word2nums):
  64. syns = synonyms(word2nums, num2words, word)
  65. w.add_document(word=text_type(word), syns=syns)
  66. w.commit()
  67. return ix
  68. def synonyms(word2nums, num2words, word):
  69. """Uses the word2nums and num2words dicts to look up synonyms
  70. for the given word. Returns a list of synonym strings.
  71. """
  72. keys = word2nums[word]
  73. syns = set()
  74. for key in keys:
  75. syns = syns.union(num2words[key])
  76. if word in syns:
  77. syns.remove(word)
  78. return sorted(syns)
  79. class Thesaurus(object):
  80. """Represents the WordNet synonym database, either loaded into memory
  81. from the wn_s.pl Prolog file, or stored on disk in a Whoosh index.
  82. This class allows you to parse the prolog file "wn_s.pl" from the WordNet prolog
  83. download into an object suitable for looking up synonyms and performing query
  84. expansion.
  85. http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
  86. To load a Thesaurus object from the wn_s.pl file...
  87. >>> t = Thesaurus.from_filename("wn_s.pl")
  88. To save the in-memory Thesaurus to a Whoosh index...
  89. >>> from whoosh.filedb.filestore import FileStorage
  90. >>> fs = FileStorage("index")
  91. >>> t.to_storage(fs)
  92. To load a Thesaurus object from a Whoosh index...
  93. >>> t = Thesaurus.from_storage(fs)
  94. The Thesaurus object is thus usable in two ways:
  95. * Parse the wn_s.pl file into memory (Thesaurus.from_*) and then look up
  96. synonyms in memory. This has a startup cost for parsing the file, and uses
  97. quite a bit of memory to store two large dictionaries, however synonym
  98. look-ups are very fast.
  99. * Parse the wn_s.pl file into memory (Thesaurus.from_filename) then save it to
  100. an index (to_storage). From then on, open the thesaurus from the saved
  101. index (Thesaurus.from_storage). This has a large cost for storing the index,
  102. but after that it is faster to open the Thesaurus (than re-parsing the file)
  103. but slightly slower to look up synonyms.
  104. Here are timings for various tasks on my (fast) Windows machine, which might
  105. give an idea of relative costs for in-memory vs. on-disk.
  106. ================================================ ================
  107. Task Approx. time (s)
  108. ================================================ ================
  109. Parsing the wn_s.pl file 1.045
  110. Saving to an on-disk index 13.084
  111. Loading from an on-disk index 0.082
  112. Look up synonyms for "light" (in memory) 0.0011
  113. Look up synonyms for "light" (loaded from disk) 0.0028
  114. ================================================ ================
  115. Basically, if you can afford spending the memory necessary to parse the
  116. Thesaurus and then cache it, it's faster. Otherwise, use an on-disk index.
  117. """
  118. def __init__(self):
  119. self.w2n = None
  120. self.n2w = None
  121. self.searcher = None
  122. @classmethod
  123. def from_file(cls, fileobj):
  124. """Creates a Thesaurus object from the given file-like object, which should
  125. contain the WordNet wn_s.pl file.
  126. >>> f = open("wn_s.pl")
  127. >>> t = Thesaurus.from_file(f)
  128. >>> t.synonyms("hail")
  129. ['acclaim', 'come', 'herald']
  130. """
  131. thes = cls()
  132. thes.w2n, thes.n2w = parse_file(fileobj)
  133. return thes
  134. @classmethod
  135. def from_filename(cls, filename):
  136. """Creates a Thesaurus object from the given filename, which should
  137. contain the WordNet wn_s.pl file.
  138. >>> t = Thesaurus.from_filename("wn_s.pl")
  139. >>> t.synonyms("hail")
  140. ['acclaim', 'come', 'herald']
  141. """
  142. f = open(filename, "rb")
  143. try:
  144. return cls.from_file(f)
  145. finally:
  146. f.close()
  147. @classmethod
  148. def from_storage(cls, storage, indexname="THES"):
  149. """Creates a Thesaurus object from the given storage object,
  150. which should contain an index created by Thesaurus.to_storage().
  151. >>> from whoosh.filedb.filestore import FileStorage
  152. >>> fs = FileStorage("index")
  153. >>> t = Thesaurus.from_storage(fs)
  154. >>> t.synonyms("hail")
  155. ['acclaim', 'come', 'herald']
  156. :param storage: A :class:`whoosh.store.Storage` object from
  157. which to load the index.
  158. :param indexname: A name for the index. This allows you to
  159. store multiple indexes in the same storage object.
  160. """
  161. thes = cls()
  162. index = storage.open_index(indexname=indexname)
  163. thes.searcher = index.searcher()
  164. return thes
  165. def to_storage(self, storage, indexname="THES"):
  166. """Creates am index in the given storage object from the
  167. synonyms loaded from a WordNet file.
  168. >>> from whoosh.filedb.filestore import FileStorage
  169. >>> fs = FileStorage("index")
  170. >>> t = Thesaurus.from_filename("wn_s.pl")
  171. >>> t.to_storage(fs)
  172. :param storage: A :class:`whoosh.store.Storage` object in
  173. which to save the index.
  174. :param indexname: A name for the index. This allows you to
  175. store multiple indexes in the same storage object.
  176. """
  177. if not self.w2n or not self.n2w:
  178. raise Exception("No synonyms loaded")
  179. make_index(storage, indexname, self.w2n, self.n2w)
  180. def synonyms(self, word):
  181. """Returns a list of synonyms for the given word.
  182. >>> thesaurus.synonyms("hail")
  183. ['acclaim', 'come', 'herald']
  184. """
  185. word = word.lower()
  186. if self.searcher:
  187. return self.searcher.document(word=word)["syns"]
  188. else:
  189. return synonyms(self.w2n, self.n2w, word)