ppix_builder.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. from PyPPUB import ppub
  2. from PyPPUB import word_tree
  3. import struct
  4. class PpixBuilder:
  5. def __init__(self):
  6. self.words = {}
  7. self.collections = []
  8. self.tags = {}
  9. self.pubs = []
  10. self.alternative_locations = []
  11. self.word_tree = word_tree.WordBit()
  12. def add_publication(self, path, pub: ppub.Ppub):
  13. index = len(self.pubs)
  14. self.pubs.append((path, pub))
  15. pub_tags = pub.metadata.get_value("tags")
  16. if (pub_tags != None):
  17. for tag in pub_tags.split(" "):
  18. if(tag in self.tags):
  19. self.collections[self.tags[tag]].add(index)
  20. else:
  21. collection_index = len(self.collections)
  22. self.collections.append(set([index,]))
  23. self.tags[tag] = collection_index
  24. words = set()
  25. def add_to_set(word_soup):
  26. if(word_soup == None):
  27. return
  28. stripped = "".join((x if x.isalnum() else " ") for x in word_soup)
  29. for word in stripped.split(" "):
  30. words.add(word.lower())
  31. add_to_set(pub.metadata.get_value("title"))
  32. add_to_set(pub.metadata.get_value("description"))
  33. add_to_set(pub.metadata.get_value("author"))
  34. add_to_set(pub.get_asset_data(pub.default_asset).decode('utf-8').replace("\n", " "))
  35. words.remove("")
  36. print(words)
  37. for word in words:
  38. if(word in self.words):
  39. self.collections[self.words[word]].add(index)
  40. else:
  41. collection_index = len(self.collections)
  42. self.collections.append(set([index,]))
  43. self.words[word] = collection_index
  44. def add_alternative_location(self, url):
  45. self.alternative_locations.append(url)
  46. def write_out(self, stream):
  47. # Magic number
  48. stream.write(b"PPIX\x00")
  49. start = 35
  50. publication_index_start = start
  51. publication_index = self.serialise_publication_index(start)
  52. start += len(publication_index)
  53. collection_index_start = start
  54. collection_index = self.serialise_collections(start)
  55. start += len(collection_index)
  56. tag_index_start = start
  57. tag_index = self.serialise_tags()
  58. start += len(tag_index)
  59. alternative_location_index_start = start
  60. alternative_location_index = self.serialise_alternative_locations()
  61. start += len(alternative_location_index)
  62. stream.write(struct.pack("<IIII", publication_index_start, collection_index_start, tag_index_start, start))
  63. stream.write(b"ECMDATA\x00")
  64. stream.write(struct.pack("<HI", 1, alternative_location_index_start))
  65. stream.write(publication_index)
  66. stream.write(collection_index)
  67. stream.write(tag_index)
  68. stream.write(alternative_location_index)
  69. self.serialise_word_tree(start, stream)
  70. stream.flush()
  71. stream.close()
  72. def serialise_publication_index(self, start_position):
  73. data = struct.pack("<I", len(self.pubs))
  74. string_data_start = start_position + 4 + (len(self.pubs) * 6)
  75. string_data = b""
  76. for pub in self.pubs:
  77. encoded = pub[0].encode('utf-8')
  78. data += struct.pack("<IH", string_data_start + len(string_data), len(encoded))
  79. string_data += encoded
  80. return data + string_data
  81. def serialise_collections(self, start_position):
  82. index_data = b""
  83. collection_data_start = start_position + (len(self.collections) * 6)
  84. collection_data = b""
  85. for col in self.collections:
  86. index_data += struct.pack("<IH", collection_data_start + len(collection_data), len(col))
  87. for pub_id in col:
  88. collection_data += struct.pack("<I", pub_id)
  89. return index_data + collection_data
  90. def serialise_tags(self):
  91. data = struct.pack("<H", len(self.tags))
  92. for key, value in self.tags.items():
  93. encoded = key.encode("utf-8")
  94. data += struct.pack("<BI", len(encoded), value)
  95. data += encoded
  96. return data
  97. def serialise_alternative_locations(self):
  98. data = struct.pack("<H", len(self.alternative_locations))
  99. for location in self.alternative_locations:
  100. encoded = location.encode("utf-8")
  101. data += struct.pack("<H", len(encoded))
  102. data += encoded
  103. return data
  104. def serialise_word_tree(self, start_position, stream):
  105. words = sorted(((self.string_to_bool_array(k), v) for k, v in self.words.items()), key=lambda x: x[0][0])
  106. root = word_tree.WordBit()
  107. nodes = {"": root}
  108. for word in words:
  109. last_bit = None
  110. for i in range(len(word[0][0])):
  111. key = word[0][0][:i+1]
  112. if(key in nodes):
  113. last_bit = key
  114. continue
  115. last_bit = word_tree.WordBit()
  116. past_key = word[0][0][:i]
  117. if(word[0][1][i]):
  118. nodes[past_key].next_1 = last_bit
  119. else:
  120. nodes[past_key].next_0 = last_bit
  121. nodes[key] = last_bit
  122. last_bit.collection = word[1]
  123. root.position = start_position
  124. node_array = [root,]
  125. del nodes[""]
  126. counter = root.position + word_tree.WordBit.SIZE
  127. for node in nodes.values():
  128. node.position = counter
  129. node_array.append(node)
  130. counter += word_tree.WordBit.SIZE
  131. for node in node_array:
  132. stream.write(node.serialise())
  133. def string_to_bool_array(self, string):
  134. data = string.encode("utf-8")
  135. array = []
  136. for byte in data:
  137. for i in [1,2,4,8,16,32,64,128]:
  138. array.append(byte & i == i)
  139. return ("".join(("1" if x else "0" for x in array)), array)
  140. if(__name__ == "__main__"):
  141. a = PpixBuilder()
  142. import glob
  143. paths = glob.glob("ppubs/*.ppub")
  144. for path in paths:
  145. a.add_publication(path.split("/")[-1], ppub.Ppub.from_stream(open(path, 'rb')))
  146. f = open("lib.ppix", 'wb')
  147. a.write_out(f)
  148. import ppix
  149. f = open("lib.ppix", 'rb')
  150. ix = ppix.Ppix(f)
  151. print("{} publication(s)".format(ix.get_publications_count()))
  152. print("{} tag(s)".format(ix.get_tags_count()))
  153. for tag in ix.get_tags():
  154. print(tag)
  155. word = "ethics"
  156. col = ix.find_word_matches(word)
  157. if(col != None):
  158. print("The following publications contain the word '{}'".format(word))
  159. for pub_id in ix.get_collection_by_id(col):
  160. print(ix.get_publication_by_id(pub_id))