# ppix_builder.py
  1. from PyPPUB import ppub
  2. from PyPPUB import word_tree
  3. import struct
  4. class PpixBuilder:
  5. def __init__(self):
  6. self.words = {}
  7. self.collections = []
  8. self.tags = {}
  9. self.pubs = []
  10. self.word_tree = word_tree.WordBit()
  11. def add_publication(self, path, pub: ppub.Ppub):
  12. index = len(self.pubs)
  13. self.pubs.append((path, pub))
  14. pub_tags = pub.metadata.get_value("tags")
  15. if (pub_tags != None):
  16. for tag in pub_tags.split(" "):
  17. if(tag in self.tags):
  18. self.collections[self.tags[tag]].add(index)
  19. else:
  20. collection_index = len(self.collections)
  21. self.collections.append(set([index,]))
  22. self.tags[tag] = collection_index
  23. words = set()
  24. def add_to_set(word_soup):
  25. if(word_soup == None):
  26. return
  27. stripped = "".join((x if x.isalnum() else " ") for x in word_soup)
  28. for word in stripped.split(" "):
  29. words.add(word.lower())
  30. add_to_set(pub.metadata.get_value("title"))
  31. add_to_set(pub.metadata.get_value("description"))
  32. add_to_set(pub.metadata.get_value("author"))
  33. add_to_set(pub.get_asset_data(pub.default_asset).decode('utf-8').replace("\n", " "))
  34. words.remove("")
  35. print(words)
  36. for word in words:
  37. if(word in self.words):
  38. self.collections[self.words[word]].add(index)
  39. else:
  40. collection_index = len(self.collections)
  41. self.collections.append(set([index,]))
  42. self.words[word] = collection_index
  43. def write_out(self, stream):
  44. # Magic number
  45. stream.write(b"PPIX\x00")
  46. start = 21
  47. publication_index_start = start
  48. publication_index = self.serialise_publication_index(start)
  49. start += len(publication_index)
  50. collection_index_start = start
  51. collection_index = self.serialise_collections(start)
  52. start += len(collection_index)
  53. tag_index_start = start
  54. tag_index = self.serialise_tags()
  55. start += len(tag_index)
  56. stream.write(struct.pack("<IIII", publication_index_start, collection_index_start, tag_index_start, start))
  57. stream.write(publication_index)
  58. stream.write(collection_index)
  59. stream.write(tag_index)
  60. self.serialise_word_tree(start, stream)
  61. stream.flush()
  62. stream.close()
  63. def serialise_publication_index(self, start_position):
  64. data = struct.pack("<I", len(self.pubs))
  65. string_data_start = start_position + 4 + (len(self.pubs) * 6)
  66. string_data = b""
  67. for pub in self.pubs:
  68. encoded = pub[0].encode('utf-8')
  69. data += struct.pack("<IH", string_data_start + len(string_data), len(encoded))
  70. string_data += encoded
  71. return data + string_data
  72. def serialise_collections(self, start_position):
  73. index_data = b""
  74. collection_data_start = start_position + (len(self.collections) * 6)
  75. collection_data = b""
  76. for col in self.collections:
  77. index_data += struct.pack("<IH", collection_data_start + len(collection_data), len(col))
  78. for pub_id in col:
  79. collection_data += struct.pack("<I", pub_id)
  80. return index_data + collection_data
  81. def serialise_tags(self):
  82. data = struct.pack("<H", len(self.tags))
  83. for key, value in self.tags.items():
  84. encoded = key.encode("utf-8")
  85. data += struct.pack("<BI", len(encoded), value)
  86. data += encoded
  87. return data
  88. def serialise_word_tree(self, start_position, stream):
  89. words = sorted(((self.string_to_bool_array(k), v) for k, v in self.words.items()), key=lambda x: x[0][0])
  90. root = word_tree.WordBit()
  91. nodes = {"": root}
  92. for word in words:
  93. last_bit = None
  94. for i in range(len(word[0][0])):
  95. key = word[0][0][:i+1]
  96. if(key in nodes):
  97. last_bit = key
  98. continue
  99. last_bit = word_tree.WordBit()
  100. past_key = word[0][0][:i]
  101. if(word[0][1][i]):
  102. nodes[past_key].next_1 = last_bit
  103. else:
  104. nodes[past_key].next_0 = last_bit
  105. nodes[key] = last_bit
  106. last_bit.collection = word[1]
  107. root.position = start_position
  108. node_array = [root,]
  109. del nodes[""]
  110. counter = root.position + word_tree.WordBit.SIZE
  111. for node in nodes.values():
  112. node.position = counter
  113. node_array.append(node)
  114. counter += word_tree.WordBit.SIZE
  115. for node in node_array:
  116. stream.write(node.serialise())
  117. def string_to_bool_array(self, string):
  118. data = string.encode("utf-8")
  119. array = []
  120. for byte in data:
  121. for i in [1,2,4,8,16,32,64,128]:
  122. array.append(byte & i == i)
  123. return ("".join(("1" if x else "0" for x in array)), array)
  124. if(__name__ == "__main__"):
  125. a = PpixBuilder()
  126. import glob
  127. paths = glob.glob("ppubs/*.ppub")
  128. for path in paths:
  129. a.add_publication(path.split("/")[-1], ppub.Ppub.from_stream(open(path, 'rb')))
  130. f = open("lib.ppix", 'wb')
  131. a.write_out(f)
  132. import ppix
  133. f = open("lib.ppix", 'rb')
  134. ix = ppix.Ppix(f)
  135. print("{} publication(s)".format(ix.get_publications_count()))
  136. print("{} tag(s)".format(ix.get_tags_count()))
  137. for tag in ix.get_tags():
  138. print(tag);
  139. word = "ethics"
  140. col = ix.find_word_matches(word)
  141. if(col != None):
  142. print("The following publications contain the word '{}'".format(word))
  143. for pub_id in ix.get_collection_by_id(col):
  144. print(ix.get_publication_by_id(pub_id))